diff --git a/antora/modules/ROOT/nav.adoc b/antora/modules/ROOT/nav.adoc index 89c2dc6f..49041e50 100644 --- a/antora/modules/ROOT/nav.adoc +++ b/antora/modules/ROOT/nav.adoc @@ -149,3 +149,51 @@ *** xref:Building_a_Simple_Engine/Advanced_Topics/Robustness2.adoc[Robustness2] ** Appendix *** xref:Building_a_Simple_Engine/Appendix/appendix.adoc[Appendix] + +* Synchronization 2 +** xref:Synchronization/introduction.adoc[Introduction] +** Anatomy of a Dependency +*** xref:Synchronization/Anatomy_of_a_Dependency/01_introduction.adoc[Introduction] +*** xref:Synchronization/Anatomy_of_a_Dependency/02_execution_vs_memory.adoc[Execution vs. Memory] +*** xref:Synchronization/Anatomy_of_a_Dependency/03_sync2_advantage.adoc[Sync 2 Advantage] +*** xref:Synchronization/Anatomy_of_a_Dependency/04_refined_pipeline_stages.adoc[Refined Pipeline Stages] +*** xref:Synchronization/Anatomy_of_a_Dependency/05_conclusion.adoc[Conclusion] +** Pipeline Barriers and Transitions +*** xref:Synchronization/Pipeline_Barriers_Transitions/01_introduction.adoc[Introduction] +*** xref:Synchronization/Pipeline_Barriers_Transitions/02_image_barrier.adoc[The Image Barrier] +*** xref:Synchronization/Pipeline_Barriers_Transitions/03_queue_family_ownership.adoc[Queue Family Ownership] +*** xref:Synchronization/Pipeline_Barriers_Transitions/04_global_vs_local_barriers.adoc[Global vs. 
Local Barriers] +** Timeline Semaphores +*** xref:Synchronization/Timeline_Semaphores/01_introduction.adoc[Introduction] +*** xref:Synchronization/Timeline_Semaphores/02_unifying_sync.adoc[Unifying Sync] +*** xref:Synchronization/Timeline_Semaphores/03_monotonic_counter.adoc[Monotonic Counter] +*** xref:Synchronization/Timeline_Semaphores/04_wait_before_signal.adoc[Wait Before Signal] +** Frame-in-Flight Architecture +*** xref:Synchronization/Frame_in_Flight/01_introduction.adoc[Introduction] +*** xref:Synchronization/Frame_in_Flight/02_managing_concurrent_frames.adoc[Managing Concurrent Frames] +*** xref:Synchronization/Frame_in_Flight/03_resource_lifetimes.adoc[Resource Lifetimes] +** Asynchronous Compute & Overlap +*** xref:Synchronization/Async_Compute_Overlap/01_introduction.adoc[Introduction] +*** xref:Synchronization/Async_Compute_Overlap/02_maximizing_throughput.adoc[Maximizing Throughput] +*** xref:Synchronization/Async_Compute_Overlap/03_async_post_processing.adoc[Async Post-processing] +*** xref:Synchronization/Async_Compute_Overlap/04_bubble_problem.adoc[The Bubble Problem] +** Transfer Queues & Asset Streaming Sync +*** xref:Synchronization/Transfer_Queues_Streaming/01_introduction.adoc[Introduction] +*** xref:Synchronization/Transfer_Queues_Streaming/02_non_blocking_uploads.adoc[Non-blocking Uploads] +*** xref:Synchronization/Transfer_Queues_Streaming/03_staging_sync.adoc[Staging Sync] +** Synchronization in Dynamic Rendering +*** xref:Synchronization/Dynamic_Rendering_Sync/01_introduction.adoc[Introduction] +*** xref:Synchronization/Dynamic_Rendering_Sync/02_subpass_replacement.adoc[Subpass Replacement] +*** xref:Synchronization/Dynamic_Rendering_Sync/03_local_read_sync.adoc[Local Read Sync] +** Host Image Copies & Memory Mapped Sync +*** xref:Synchronization/Host_Image_Copies_Memory_Sync/01_introduction.adoc[Introduction] +*** xref:Synchronization/Host_Image_Copies_Memory_Sync/02_cpu_to_image_access.adoc[CPU-to-Image Access] +*** 
xref:Synchronization/Host_Image_Copies_Memory_Sync/03_visibility_flushes.adoc[Visibility Flushes]
+** Debugging with Synchronization Validation
+*** xref:Synchronization/Synchronization_Validation/01_introduction.adoc[Introduction]
+*** xref:Synchronization/Synchronization_Validation/02_validation_layer.adoc[Validation Layer]
+*** xref:Synchronization/Synchronization_Validation/03_interpreting_vuids.adoc[Interpreting VUIDs]
+** Profiling, Batching, and Optimization
+*** xref:Synchronization/Profiling_Optimization/01_introduction.adoc[Introduction]
+*** xref:Synchronization/Profiling_Optimization/02_barrier_batching.adoc[Barrier Batching]
+*** xref:Synchronization/Profiling_Optimization/03_visualizing_stalls.adoc[Visualizing Stalls]
diff --git a/attachments/sync2_engine/CMakeLists.txt b/attachments/sync2_engine/CMakeLists.txt
new file mode 100644
index 00000000..3ee1acc4
--- /dev/null
+++ b/attachments/sync2_engine/CMakeLists.txt
@@ -0,0 +1,306 @@
+cmake_minimum_required(VERSION 3.29)
+
+project(SimpleEngine VERSION 1.0.0 LANGUAGES CXX C)
+
+# Option to enable/disable Vulkan C++20 module support for this standalone project
+option(ENABLE_CPP20_MODULE "Enable C++ 20 module support for Vulkan in SimpleEngine" OFF)
+
+# Enable C++ module dependency scanning only when modules are enabled
+if(ENABLE_CPP20_MODULE)
+  set(CMAKE_CXX_SCAN_FOR_MODULES ON)
+endif()
+
+# Shared engine sources, shaders and assets live in the sibling simple_engine
+# directory; its CMake/ folder provides the custom find modules used below.
+set(SIMPLE_ENGINE_DIR "${CMAKE_CURRENT_LIST_DIR}/../simple_engine")
+list(APPEND CMAKE_MODULE_PATH "${SIMPLE_ENGINE_DIR}/CMake")
+
+# Find required packages
+find_package (glm REQUIRED)
+find_package (Vulkan REQUIRED)
+find_package (tinygltf REQUIRED)
+find_package (KTX REQUIRED)
+
+# Find or download Vulkan-Hpp headers matching the Vulkan SDK/NDK version
+find_package(VulkanHpp REQUIRED)
+
+# Vulkan::cppm is defined in both branches so later code can link it
+# unconditionally: a real C++20 module library when modules are enabled,
+# otherwise an INTERFACE target that only forwards headers and definitions.
+if(ENABLE_CPP20_MODULE)
+  # Set up Vulkan C++ module for this standalone project
+  add_library(VulkanCppModule)
+  add_library(Vulkan::cppm ALIAS VulkanCppModule)
+  target_compile_definitions(VulkanCppModule
+    PUBLIC VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1 VULKAN_HPP_NO_STRUCT_CONSTRUCTORS=1
+  )
+  target_include_directories(VulkanCppModule
+    PUBLIC
+      "${Vulkan_INCLUDE_DIR}"
+      "${VulkanHpp_INCLUDE_DIRS}"
+  )
+  target_link_libraries(VulkanCppModule
+    PUBLIC
+      Vulkan::Vulkan
+  )
+
+  set_target_properties(VulkanCppModule PROPERTIES CXX_STANDARD 20)
+
+  target_sources(VulkanCppModule
+    PUBLIC
+      FILE_SET cxx_modules TYPE CXX_MODULES
+      BASE_DIRS
+        "${VulkanHpp_CPPM_DIR}"
+      FILES
+        "${VulkanHpp_CPPM_DIR}/vulkan/vulkan.cppm"
+  )
+
+  # MSVC-specific options to improve module support
+  if(MSVC)
+    target_compile_options(VulkanCppModule PRIVATE
+      /std:c++latest
+      /permissive-
+      /Zc:__cplusplus
+      /EHsc
+      /Zc:preprocessor
+    )
+  endif()
+else()
+  add_library(VulkanCppModule INTERFACE)
+  add_library(Vulkan::cppm ALIAS VulkanCppModule)
+  target_link_libraries(VulkanCppModule INTERFACE Vulkan::Vulkan)
+  target_compile_definitions(VulkanCppModule
+    INTERFACE VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1 VULKAN_HPP_NO_STRUCT_CONSTRUCTORS=1
+  )
+  target_include_directories(VulkanCppModule INTERFACE "${VulkanHpp_INCLUDE_DIRS}")
+endif()
+
+
+
+# Platform-specific settings
+if(ANDROID)
+  # Android-specific settings
+  add_definitions(-DPLATFORM_ANDROID)
+  find_package(game-activity REQUIRED CONFIG)
+else()
+  # Desktop-specific settings
+  add_definitions(-DPLATFORM_DESKTOP)
+  find_package(glfw3 REQUIRED)
+  find_package(OpenAL REQUIRED)
+endif()
+
+# Shader compilation
+# Find Slang shaders from simple_engine (exclude utility modules).
+# CONFIGURE_DEPENDS re-runs the glob at build time so newly added shaders are
+# picked up without a manual re-configure.
+file(GLOB SLANG_SHADER_SOURCES CONFIGURE_DEPENDS "${SIMPLE_ENGINE_DIR}/shaders/*.slang")
+list(FILTER SLANG_SHADER_SOURCES EXCLUDE REGEX ".*/(common_types|pbr_utils|lighting_utils|tonemapping_utils)\\.slang$")
+
+# Find slangc executable (optional)
+find_program(SLANGC_EXECUTABLE slangc HINTS "$ENV{VULKAN_SDK}/bin")
+
+if(SLANGC_EXECUTABLE)
+  # Ensure the output directory for compiled shaders exists
+  file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/shaders")
+
+  # Start from an empty list so a value inherited from a parent scope cannot leak in
+  set(SHADER_SPVS "")
+
+  # Compile Slang shaders using slangc: one .spv per .slang source
+  foreach(SHADER ${SLANG_SHADER_SOURCES})
+    get_filename_component(SHADER_NAME "${SHADER}" NAME)
+    # "\\." reaches the regex engine as "\." (a literal dot); a single
+    # backslash would be consumed by CMake and leave "." matching any character.
+    string(REGEX REPLACE "\\.slang$" "" OUTPUT_NAME "${SHADER_NAME}")
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/shaders/${OUTPUT_NAME}.spv"
+      COMMAND "${SLANGC_EXECUTABLE}" "${SHADER}" -target spirv -profile spirv_1_4 -emit-spirv-directly -o "${CMAKE_CURRENT_BINARY_DIR}/shaders/${OUTPUT_NAME}.spv"
+      DEPENDS "${SHADER}"
+      COMMENT "Compiling Slang shader ${SHADER_NAME} with slangc"
+      VERBATIM
+    )
+    list(APPEND SHADER_SPVS "${CMAKE_CURRENT_BINARY_DIR}/shaders/${OUTPUT_NAME}.spv")
+  endforeach()
+
+  add_custom_target(shaders DEPENDS ${SHADER_SPVS})
+else()
+  message(STATUS "slangc not found. Skipping shader compilation step.")
+  # Empty placeholder so add_dependencies(... shaders) remains valid
+  add_custom_target(shaders)
+endif()
+
+# Source files
+# NOTE: Android builds include this project via `add_subdirectory(...)` from
+# `android/app/src/main/cpp/CMakeLists.txt`, so we must not require a desktop `main()`.
+# Sources shared by every platform. Files without a directory prefix are the
+# sync2_engine variants in this directory; the rest are taken from simple_engine.
+set(SOURCES_COMMON
+  engine.cpp
+  scene_loading.cpp
+  ${SIMPLE_ENGINE_DIR}/platform.cpp
+  renderer_core.cpp
+  renderer_rendering.cpp
+  renderer_pipelines.cpp
+  renderer_compute.cpp
+  renderer_utils.cpp
+  renderer_resources.cpp
+  renderer_ray_query.cpp
+  memory_pool.cpp
+  ${SIMPLE_ENGINE_DIR}/resource_manager.cpp
+  ${SIMPLE_ENGINE_DIR}/entity.cpp
+  ${SIMPLE_ENGINE_DIR}/component.cpp
+  ${SIMPLE_ENGINE_DIR}/transform_component.cpp
+  ${SIMPLE_ENGINE_DIR}/mesh_component.cpp
+  ${SIMPLE_ENGINE_DIR}/camera_component.cpp
+  ${SIMPLE_ENGINE_DIR}/animation_component.cpp
+  model_loader.cpp
+  audio_system.cpp
+  physics_system.cpp
+  imgui_system.cpp
+  ${SIMPLE_ENGINE_DIR}/imgui/imgui.cpp
+  ${SIMPLE_ENGINE_DIR}/imgui/imgui_draw.cpp
+  ${SIMPLE_ENGINE_DIR}/vulkan_device.cpp
+  pipeline.cpp
+  ${SIMPLE_ENGINE_DIR}/descriptor_manager.cpp
+  ${SIMPLE_ENGINE_DIR}/renderdoc_debug_system.cpp
+  ${SIMPLE_ENGINE_DIR}/mikktspace.c
+)
+
+# Desktop-only entry point
+set(SOURCES_DESKTOP
+  main.cpp
+)
+
+# Create target
+if (ANDROID)
+  # Android: build the engine as a library to be linked into the app's `simple_engine_android` SHARED library.
+  add_library(SimpleEngine STATIC ${SOURCES_COMMON})
+else ()
+  # Desktop: build the runnable executable (unchanged behavior vs `HEAD`).
+  add_executable(SimpleEngine ${SOURCES_COMMON} ${SOURCES_DESKTOP})
+endif ()
+
+# Local headers take precedence, then the shared simple_engine headers and its bundled imgui
+target_include_directories(SimpleEngine PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${SIMPLE_ENGINE_DIR}
+  ${SIMPLE_ENGINE_DIR}/imgui
+)
+
+# Build ordering only: make sure the shader target runs before the engine is built
+add_dependencies(SimpleEngine shaders)
+
+# The sources use C++20 library features, so fail at configure time instead of
+# silently falling back to an older standard when the compiler lacks C++20.
+set_target_properties (SimpleEngine PROPERTIES
+  CXX_STANDARD 20
+  CXX_STANDARD_REQUIRED ON
+)
+
+# Enable required defines for GLM experimental extensions and MSVC math constants
+target_compile_definitions(SimpleEngine PRIVATE
+  GLM_ENABLE_EXPERIMENTAL
+  _USE_MATH_DEFINES
+  VULKAN_HPP_NO_STRUCT_CONSTRUCTORS
+  VULKAN_HPP_DISPATCH_LOADER_DYNAMIC
+)
+
+# Link libraries
+# Prefer the Vulkan C++ module target when available (configured at the parent level),
+# but fall back to the standard Vulkan library otherwise.
+if(TARGET Vulkan::cppm)
+  target_link_libraries(SimpleEngine PUBLIC Vulkan::cppm)
+else()
+  target_link_libraries(SimpleEngine PUBLIC Vulkan::Vulkan)
+endif()
+
+target_link_libraries(SimpleEngine PUBLIC
+  glm::glm
+  tinygltf::tinygltf
+  KTX::ktx
+)
+
+# Platform windowing/audio backends
+if (ANDROID)
+  target_link_libraries(SimpleEngine PUBLIC game-activity::game-activity OpenSLES android log)
+else ()
+  target_link_libraries(SimpleEngine PRIVATE glfw OpenAL::OpenAL)
+endif()
+
+# Windows/MSVC portability and build settings
+if(MSVC)
+  # Avoid Windows.h macro pollution and CRT warnings; improve conformance and build perf
+  target_compile_definitions(SimpleEngine PRIVATE
+    NOMINMAX
+    WIN32_LEAN_AND_MEAN
+    _CRT_SECURE_NO_WARNINGS
+  )
+  target_compile_options(SimpleEngine PRIVATE
+    /permissive-
+    /Zc:__cplusplus
+    /EHsc
+    /W3
+    /MP
+    /bigobj
+  )
+  # Crash reporter uses Dbghelp; pragma should suffice, but make it explicit for clarity
+  target_link_libraries(SimpleEngine PRIVATE Dbghelp)
+elseif(WIN32)
+  # Non-MSVC Windows toolchains (e.g., MinGW)
+  target_compile_definitions(SimpleEngine PRIVATE
+    NOMINMAX
+    WIN32_LEAN_AND_MEAN
+    _CRT_SECURE_NO_WARNINGS
+  )
+endif()
+
+# Copy model and texture files from simple_engine next to the built executable.
+# $<TARGET_FILE_DIR:...> is used instead of CMAKE_RUNTIME_OUTPUT_DIRECTORY
+# because this file never sets that variable; when it is empty the destination
+# would collapse to "/models" / "/textures". The generator expression also
+# resolves the per-configuration directory of multi-config generators.
+if(EXISTS ${SIMPLE_ENGINE_DIR}/models)
+  if (NOT ANDROID)
+    add_custom_command(TARGET SimpleEngine POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_directory "${SIMPLE_ENGINE_DIR}/models" "$<TARGET_FILE_DIR:SimpleEngine>/models"
+      COMMENT "Copying models to output directory"
+      VERBATIM
+    )
+  endif()
+endif ()
+
+if(EXISTS ${SIMPLE_ENGINE_DIR}/textures)
+  if (NOT ANDROID)
+    add_custom_command(TARGET SimpleEngine POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_directory "${SIMPLE_ENGINE_DIR}/textures" "$<TARGET_FILE_DIR:SimpleEngine>/textures"
+      COMMENT "Copying textures to output directory"
+      VERBATIM
+    )
+  endif()
+endif ()
+
+# Add packaging configuration
+# All CPACK_* variables must be set BEFORE include(CPack): the module reads
+# them while it is being included, so values assigned afterwards are ignored.
+
+# Set package properties
+set(CPACK_PACKAGE_NAME "SimpleEngine")
+set(CPACK_PACKAGE_VENDOR "SimpleEngine Team")
+set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "A simple game engine built with Vulkan")
+set(CPACK_PACKAGE_VERSION "1.0.0")
+set(CPACK_PACKAGE_VERSION_MAJOR "1")
+set(CPACK_PACKAGE_VERSION_MINOR "0")
+set(CPACK_PACKAGE_VERSION_PATCH "0")
+set(CPACK_PACKAGE_INSTALL_DIRECTORY "SimpleEngine")
+
+# Set platform-specific package generators
+if(WIN32)
+  set(CPACK_GENERATOR "ZIP;NSIS")
+  set(CPACK_NSIS_PACKAGE_NAME "SimpleEngine")
+  set(CPACK_NSIS_DISPLAY_NAME "SimpleEngine")
+  set(CPACK_NSIS_HELP_LINK "https://github.com/yourusername/SimpleEngine")
+  set(CPACK_NSIS_URL_INFO_ABOUT "https://github.com/yourusername/SimpleEngine")
+  set(CPACK_NSIS_CONTACT "your.email@example.com")
+  set(CPACK_NSIS_MODIFY_PATH ON)
+elseif(APPLE)
+  set(CPACK_GENERATOR "ZIP;DragNDrop")
+  set(CPACK_DMG_VOLUME_NAME "SimpleEngine")
+  set(CPACK_DMG_FORMAT "UDBZ")
+else()
+  set(CPACK_GENERATOR "ZIP;DEB")
+  set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Your Name ")
+  set(CPACK_DEBIAN_PACKAGE_SECTION "games")
+  set(CPACK_DEBIAN_PACKAGE_DEPENDS "libvulkan1, libglfw3, libglm-dev, libktx-dev")
+endif()
+
+# Generate the packaging configuration now that every CPACK_* value is final
+include(CPack)
+
+# Include binary and resource directories in the package
+if (NOT ANDROID)
+  install(TARGETS SimpleEngine DESTINATION bin)
+  # Compiled shaders only exist when slangc was found at configure time
+  if(SLANGC_EXECUTABLE)
+    install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/shaders DESTINATION share/SimpleEngine)
+  endif()
+
+  # Install models and textures if they exist in simple_engine
+  if(EXISTS ${SIMPLE_ENGINE_DIR}/models)
+    install(DIRECTORY ${SIMPLE_ENGINE_DIR}/models DESTINATION share/SimpleEngine)
+  endif()
+
+  if(EXISTS ${SIMPLE_ENGINE_DIR}/textures)
+    install(DIRECTORY ${SIMPLE_ENGINE_DIR}/textures DESTINATION share/SimpleEngine)
+  endif()
+
+  # Install README from simple_engine if it exists
+  if(EXISTS ${SIMPLE_ENGINE_DIR}/README.md)
+    install(FILES ${SIMPLE_ENGINE_DIR}/README.md DESTINATION share/SimpleEngine)
+  endif()
+endif ()
diff --git a/attachments/sync2_engine/audio_system.cpp b/attachments/sync2_engine/audio_system.cpp
new file mode 100644
index 00000000..432285f8
--- /dev/null
+++ 
b/attachments/sync2_engine/audio_system.cpp @@ -0,0 +1,1827 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "audio_system.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(PLATFORM_ANDROID) +# include +# include +#else +// OpenAL headers +# ifdef __APPLE__ +# include +# include +# else +# include +# include +# endif +#endif + +#include "engine.h" +#include "renderer.h" + +#if !defined(PLATFORM_ANDROID) +// OpenAL error checking utility +static void CheckOpenALError(const std::string& operation) { + ALenum error = alGetError(); + if (error != AL_NO_ERROR) { + std::cerr << "OpenAL Error in " << operation << ": "; + switch (error) { + case AL_INVALID_NAME: + std::cerr << "AL_INVALID_NAME"; + break; + case AL_INVALID_ENUM: + std::cerr << "AL_INVALID_ENUM"; + break; + case AL_INVALID_VALUE: + std::cerr << "AL_INVALID_VALUE"; + break; + case AL_INVALID_OPERATION: + std::cerr << "AL_INVALID_OPERATION"; + break; + case AL_OUT_OF_MEMORY: + std::cerr << "AL_OUT_OF_MEMORY"; + break; + default: + std::cerr << "Unknown error " << error; + break; + } + std::cerr << std::endl; + } +} +#endif + +// Concrete implementation of AudioSource +class ConcreteAudioSource : public AudioSource { + public: + explicit ConcreteAudioSource(std::string name) : 
name(std::move(name)) { + } + ~ConcreteAudioSource() override = default; + + void Play() override { + playing = true; + playbackPosition = 0; + delayTimer = std::chrono::milliseconds(0); + inDelayPhase = false; + sampleAccumulator = 0.0; + } + + void Pause() override { + playing = false; + } + + void Stop() override { + playing = false; + playbackPosition = 0; + delayTimer = std::chrono::milliseconds(0); + inDelayPhase = false; + sampleAccumulator = 0.0; + } + + void SetVolume(float volume) override { + this->volume = volume; + } + + void SetLoop(bool loop) override { + this->loop = loop; + } + + void SetPosition(float x, float y, float z) override { + position[0] = x; + position[1] = y; + position[2] = z; + } + + void SetVelocity(float x, float y, float z) override { + velocity[0] = x; + velocity[1] = y; + velocity[2] = z; + } + + [[nodiscard]] bool IsPlaying() const override { + return playing; + } + + // Additional methods for delay functionality + void SetAudioLength(uint32_t lengthInSamples) { + audioLengthSamples = lengthInSamples; + } + + void UpdatePlayback(std::chrono::milliseconds deltaTime, uint32_t samplesProcessed) { + if (!playing) + return; + + if (inDelayPhase) { + // We're in the delay phase between playthroughs + delayTimer += deltaTime; + if (delayTimer >= delayDuration) { + // Delay finished, restart playback + inDelayPhase = false; + playbackPosition = 0; + delayTimer = std::chrono::milliseconds(0); + } + } else { + // Normal playback, update position + playbackPosition += samplesProcessed; + + // Check if we've reached the end of the audio + if (audioLengthSamples > 0 && playbackPosition >= audioLengthSamples) { + if (loop) { + // Start the delay phase before looping + inDelayPhase = true; + delayTimer = std::chrono::milliseconds(0); + } else { + // Stop playing if not looping + playing = false; + playbackPosition = 0; + } + } + } + } + + [[nodiscard]] bool ShouldProcessAudio() const { + return playing && !inDelayPhase; + } + + [[nodiscard]] 
uint32_t GetPlaybackPosition() const { + return playbackPosition; + } + + [[nodiscard]] const std::string& GetName() const { + return name; + } + + [[nodiscard]] const float* GetPosition() const { + return position; + } + + [[nodiscard]] double GetSampleAccumulator() const { + return sampleAccumulator; + } + + void SetSampleAccumulator(double value) { + sampleAccumulator = value; + } + + private: + std::string name; + bool playing = false; + bool loop = false; + float volume = 1.0f; + float position[3] = {0.0f, 0.0f, 0.0f}; + float velocity[3] = {0.0f, 0.0f, 0.0f}; + + // Delay and timing functionality + uint32_t playbackPosition = 0; // Current position in samples + uint32_t audioLengthSamples = 0; // Total length of audio in samples + std::chrono::milliseconds delayTimer = std::chrono::milliseconds(0); // Timer for delay between loops + bool inDelayPhase = false; // Whether we're currently in the delay phase + static constexpr std::chrono::milliseconds delayDuration = std::chrono::milliseconds(1500); // 1.5-second delay between loops + double sampleAccumulator = 0.0; // Per-source sample accumulator for proper timing +}; + +#if defined(PLATFORM_ANDROID) + +// OpenSL ES audio output device implementation +class OpenSLESAudioOutputDevice : public AudioOutputDevice { + public: + OpenSLESAudioOutputDevice() = default; + ~OpenSLESAudioOutputDevice() override { + Stop(); + Cleanup(); + } + + bool Initialize(uint32_t sampleRate, uint32_t channels, uint32_t bufferSize) override { + this->sampleRate = sampleRate; + this->channels = channels == 0 ? 2u : channels; + this->bufferSize = bufferSize == 0 ? 
1024u : bufferSize; + + // Create and realize engine + SLresult result = slCreateEngine(&engineObject, 0, nullptr, 0, nullptr, nullptr); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: slCreateEngine failed (%d)", result); + return false; + } + result = (*engineObject)->Realize(engineObject, SL_BOOLEAN_FALSE); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: Engine Realize failed (%d)", result); + Cleanup(); + return false; + } + result = (*engineObject)->GetInterface(engineObject, SL_IID_ENGINE, &engineEngine); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: GetInterface SL_IID_ENGINE failed (%d)", result); + Cleanup(); + return false; + } + + // Create output mix + result = (*engineEngine)->CreateOutputMix(engineEngine, &outputMixObject, 0, nullptr, nullptr); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: CreateOutputMix failed (%d)", result); + Cleanup(); + return false; + } + result = (*outputMixObject)->Realize(outputMixObject, SL_BOOLEAN_FALSE); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: OutputMix Realize failed (%d)", result); + Cleanup(); + return false; + } + + // Configure source: buffer queue + PCM format + SLDataLocator_AndroidSimpleBufferQueue loc_bufq{SL_DATALOCATOR_ANDROIDSIMPLEBUFFERQUEUE, (SLuint32) NUM_BUFFERS}; + SLDataFormat_PCM format_pcm{}; + format_pcm.formatType = SL_DATAFORMAT_PCM; + format_pcm.numChannels = (SLuint32) this->channels; + format_pcm.samplesPerSec = ToSLSampleRate(this->sampleRate); + format_pcm.bitsPerSample = SL_PCMSAMPLEFORMAT_FIXED_16; + format_pcm.containerSize = 16; + format_pcm.channelMask = (this->channels == 1) ? 
(SL_SPEAKER_FRONT_CENTER) : (SL_SPEAKER_FRONT_LEFT | SL_SPEAKER_FRONT_RIGHT); + format_pcm.endianness = SL_BYTEORDER_LITTLEENDIAN; + + SLDataSource audioSrc{&loc_bufq, &format_pcm}; + + // Sink: OutputMix + SLDataLocator_OutputMix loc_outmix{SL_DATALOCATOR_OUTPUTMIX, outputMixObject}; + SLDataSink audioSnk{&loc_outmix, nullptr}; + + // Create audio player; request buffer queue interface + const SLInterfaceID ids[] = {SL_IID_BUFFERQUEUE}; + const SLboolean req[] = {SL_BOOLEAN_TRUE}; + result = (*engineEngine)->CreateAudioPlayer(engineEngine, &playerObject, &audioSrc, &audioSnk, (SLuint32)(sizeof(ids) / sizeof(ids[0])), ids, req); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: CreateAudioPlayer failed (%d)", result); + Cleanup(); + return false; + } + result = (*playerObject)->Realize(playerObject, SL_BOOLEAN_FALSE); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: Player Realize failed (%d)", result); + Cleanup(); + return false; + } + + // Interfaces + result = (*playerObject)->GetInterface(playerObject, SL_IID_PLAY, &playItf); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: GetInterface SL_IID_PLAY failed (%d)", result); + Cleanup(); + return false; + } + result = (*playerObject)->GetInterface(playerObject, SL_IID_BUFFERQUEUE, &bufferQueueItf); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: GetInterface SL_IID_BUFFERQUEUE failed (%d)", result); + Cleanup(); + return false; + } + + // Setup buffers + pcmBuffers.assign(NUM_BUFFERS, std::vector(this->bufferSize * this->channels)); + nextBufferIndex = 0; + + // Register callback + result = (*bufferQueueItf)->RegisterCallback(bufferQueueItf, &OpenSLESAudioOutputDevice::BufferQueueCallback, this); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: RegisterCallback failed (%d)", result); + Cleanup(); + return false; + } + + initialized = true; + return true; + } + + bool Start() override { + if (!initialized) + return false; + if (playing) + return true; + + playing = true; + SLresult 
result = (*playItf)->SetPlayState(playItf, SL_PLAYSTATE_PLAYING); + if (result != SL_RESULT_SUCCESS) { + LOGE("OpenSLES: SetPlayState PLAYING failed (%d)", result); + return false; + } + + // Enqueue initial buffers to kick off the callback chain + for (int i = 0; i < 2; ++i) { + EnqueueNextBuffer(); + } + + return true; + } + + bool Stop() override { + if (!initialized) + return true; + playing = false; + if (playItf) { + (*playItf)->SetPlayState(playItf, SL_PLAYSTATE_STOPPED); + } + if (bufferQueueItf) { + (*bufferQueueItf)->Clear(bufferQueueItf); + } + return true; + } + + bool WriteAudio(const float* data, uint32_t sampleCount) override { + if (!initialized) + return false; + std::lock_guard lock(bufferMutex); + const uint32_t total = sampleCount * channels; + for (uint32_t i = 0; i < total; ++i) { + audioQueue.push(data[i]); + } + return true; + } + + bool IsPlaying() const override { + return playing; + } + uint32_t GetPosition() const override { + return playbackPosition; + } + + private: + static constexpr int NUM_BUFFERS = 4; + uint32_t sampleRate = 44100; + uint32_t channels = 2; + uint32_t bufferSize = 1024; + bool initialized = false; + std::atomic playing{false}; + uint32_t playbackPosition = 0; + + SLObjectItf engineObject = nullptr; + SLEngineItf engineEngine = nullptr; + SLObjectItf outputMixObject = nullptr; + SLObjectItf playerObject = nullptr; + SLPlayItf playItf = nullptr; + SLAndroidSimpleBufferQueueItf bufferQueueItf = nullptr; + + std::vector> pcmBuffers; + int nextBufferIndex = 0; + std::queue audioQueue; + std::mutex bufferMutex; + + static SLuint32 ToSLSampleRate(uint32_t rate) { + switch (rate) { + case 44100: + return SL_SAMPLINGRATE_44_1; + case 48000: + return SL_SAMPLINGRATE_48; + default: + return SL_SAMPLINGRATE_44_1; + } + } + + static void BufferQueueCallback(SLAndroidSimpleBufferQueueItf, void* context) { + auto* self = static_cast(context); + if (self && self->playing) { + self->EnqueueNextBuffer(); + } + } + + void 
EnqueueNextBuffer() { + std::lock_guard lock(bufferMutex); + auto& buf = pcmBuffers[nextBufferIndex]; + const uint32_t totalSamples = bufferSize * channels; + + for (uint32_t i = 0; i < totalSamples; ++i) { + if (!audioQueue.empty()) { + float s = audioQueue.front(); + audioQueue.pop(); + buf[i] = static_cast(std::clamp(s, -1.0f, 1.0f) * 32767.0f); + } else { + buf[i] = 0; + } + } + + (*bufferQueueItf)->Enqueue(bufferQueueItf, buf.data(), totalSamples * sizeof(int16_t)); + playbackPosition += bufferSize; + nextBufferIndex = (nextBufferIndex + 1) % NUM_BUFFERS; + } + + void Cleanup() { + if (playerObject) { + (*playerObject)->Destroy(playerObject); + playerObject = nullptr; + } + if (outputMixObject) { + (*outputMixObject)->Destroy(outputMixObject); + outputMixObject = nullptr; + } + if (engineObject) { + (*engineObject)->Destroy(engineObject); + engineObject = nullptr; + } + } +}; + +#else + +// OpenAL audio output device implementation +class OpenALAudioOutputDevice : public AudioOutputDevice { + public: + OpenALAudioOutputDevice() = default; + ~OpenALAudioOutputDevice() override { + OpenALAudioOutputDevice::Stop(); + Cleanup(); + } + + bool Initialize(uint32_t sampleRate, uint32_t channels, uint32_t bufferSize) override { + this->sampleRate = sampleRate; + this->channels = channels; + this->bufferSize = bufferSize; + + // Initialize OpenAL + device = alcOpenDevice(nullptr); // Use default device + if (!device) { + std::cerr << "Failed to open OpenAL device" << std::endl; + return false; + } + + context = alcCreateContext(device, nullptr); + if (!context) { + std::cerr << "Failed to create OpenAL context" << std::endl; + alcCloseDevice(device); + device = nullptr; + return false; + } + + if (!alcMakeContextCurrent(context)) { + std::cerr << "Failed to make OpenAL context current" << std::endl; + alcDestroyContext(context); + alcCloseDevice(device); + context = nullptr; + device = nullptr; + return false; + } + + // Generate OpenAL source + alGenSources(1, 
&source); + CheckOpenALError("alGenSources"); + + // Generate OpenAL buffers for streaming + alGenBuffers(NUM_BUFFERS, buffers); + CheckOpenALError("alGenBuffers"); + + // Set source properties + alSourcef(source, AL_PITCH, 1.0f); + alSourcef(source, AL_GAIN, 1.0f); + alSource3f(source, AL_POSITION, 0.0f, 0.0f, 0.0f); + alSource3f(source, AL_VELOCITY, 0.0f, 0.0f, 0.0f); + alSourcei(source, AL_LOOPING, AL_FALSE); + CheckOpenALError("Source setup"); + + // Initialize audio buffer + audioBuffer.resize(bufferSize * channels); + + // Initialize buffer tracking + queuedBufferCount = 0; + while (!availableBuffers.empty()) { + availableBuffers.pop(); + } + + initialized = true; + return true; + } + + bool Start() override { + if (!initialized) { + std::cerr << "OpenAL audio output device not initialized" << std::endl; + return false; + } + + if (playing) { + return true; // Already playing + } + + playing = true; + + // Start an audio playback thread + audioThread = std::thread(&OpenALAudioOutputDevice::AudioThreadFunction, this); + + return true; + } + + bool Stop() override { + if (!playing) { + return true; // Already stopped + } + + playing = false; + + // Wait for the audio thread to finish + if (audioThread.joinable()) { + audioThread.join(); + } + + // Stop OpenAL source + if (initialized && source != 0) { + alSourceStop(source); + CheckOpenALError("alSourceStop"); + } + + return true; + } + + bool WriteAudio(const float* data, uint32_t sampleCount) override { + if (!initialized || !playing) { + return false; + } + + std::lock_guard lock(bufferMutex); + + // Add audio data to the queue + for (uint32_t i = 0; i < sampleCount * channels; i++) { + audioQueue.push(data[i]); + } + + return true; + } + + [[nodiscard]] bool IsPlaying() const override { + return playing; + } + + [[nodiscard]] uint32_t GetPosition() const override { + return playbackPosition; + } + + private: + static constexpr int NUM_BUFFERS = 8; + + uint32_t sampleRate = 44100; + uint32_t channels = 2; + 
uint32_t bufferSize = 1024; + bool initialized = false; + bool playing = false; + uint32_t playbackPosition = 0; + + // OpenAL objects + ALCdevice* device = nullptr; + ALCcontext* context = nullptr; + ALuint source = 0; + ALuint buffers[NUM_BUFFERS]{}; + int currentBuffer = 0; + + std::vector audioBuffer; + std::queue audioQueue; + std::mutex bufferMutex; + std::thread audioThread; + + // Buffer management for OpenAL streaming + std::queue availableBuffers; + int queuedBufferCount = 0; + + void Cleanup() { + if (initialized) { + // Clean up OpenAL resources + if (source != 0) { + alDeleteSources(1, &source); + source = 0; + } + + alDeleteBuffers(NUM_BUFFERS, buffers); + + if (context) { + alcMakeContextCurrent(nullptr); + alcDestroyContext(context); + context = nullptr; + } + + if (device) { + alcCloseDevice(device); + device = nullptr; + } + + // Reset buffer tracking + queuedBufferCount = 0; + while (!availableBuffers.empty()) { + availableBuffers.pop(); + } + + initialized = false; + } + } + + void AudioThreadFunction() { + // Calculate sleep time for audio buffer updates (in milliseconds) + const auto sleepTime = std::chrono::milliseconds( + static_cast((bufferSize * 1000) / sampleRate / 8) // Eighth buffer time for responsiveness + ); + + while (playing) { + ProcessAudioBuffer(); + std::this_thread::sleep_for(sleepTime); + } + } + + void ProcessAudioBuffer() { + std::lock_guard lock(bufferMutex); + + // Fill audio buffer from queue in whole stereo frames to preserve channel alignment + uint32_t samplesProcessed = 0; + const uint32_t framesAvailable = static_cast(audioQueue.size() / channels); + if (framesAvailable == 0) { + // Not enough data for a whole frame yet + return; + } + const uint32_t framesToSend = std::min(framesAvailable, bufferSize); + const uint32_t samplesToSend = framesToSend * channels; + for (uint32_t i = 0; i < samplesToSend; i++) { + audioBuffer[i] = audioQueue.front(); + audioQueue.pop(); + } + samplesProcessed = samplesToSend; + + if 
(samplesProcessed > 0) { + // Convert float samples to 16-bit PCM for OpenAL + std::vector pcmBuffer(samplesProcessed); + for (uint32_t i = 0; i < samplesProcessed; i++) { + // Clamp and convert to 16-bit PCM + float sample = std::clamp(audioBuffer[i], -1.0f, 1.0f); + pcmBuffer[i] = static_cast(sample * 32767.0f); + } + + // Check for processed buffers and unqueue them + ALint processed = 0; + alGetSourcei(source, AL_BUFFERS_PROCESSED, &processed); + CheckOpenALError("alGetSourcei AL_BUFFERS_PROCESSED"); + + // Unqueue processed buffers and add them to available buffers + while (processed > 0) { + ALuint buffer; + alSourceUnqueueBuffers(source, 1, &buffer); + CheckOpenALError("alSourceUnqueueBuffers"); + + // Add the unqueued buffer to available buffers + availableBuffers.push(buffer); + processed--; + } + + // Only proceed if we have an available buffer + ALuint buffer = 0; + if (!availableBuffers.empty()) { + buffer = availableBuffers.front(); + availableBuffers.pop(); + } else if (queuedBufferCount < NUM_BUFFERS) { + // Use a buffer that hasn't been queued yet + buffer = buffers[queuedBufferCount]; + } else { + // No available buffers, skip this frame + return; + } + + // Validate buffer parameters + if (pcmBuffer.empty()) { + // Re-add buffer to available list if we can't use it + if (queuedBufferCount >= NUM_BUFFERS) { + availableBuffers.push(buffer); + } + return; + } + + // Determine format based on channels + ALenum format = (channels == 1) ? 
AL_FORMAT_MONO16 : AL_FORMAT_STEREO16; + + // Upload audio data to OpenAL buffer + alBufferData(buffer, + format, + pcmBuffer.data(), + static_cast(samplesProcessed * sizeof(int16_t)), + static_cast(sampleRate)); + CheckOpenALError("alBufferData"); + + // Queue the buffer + alSourceQueueBuffers(source, 1, &buffer); + CheckOpenALError("alSourceQueueBuffers"); + + // Track that we've queued this buffer + if (queuedBufferCount < NUM_BUFFERS) { + queuedBufferCount++; + } + + // Start playing if not already playing + ALint sourceState; + alGetSourcei(source, AL_SOURCE_STATE, &sourceState); + CheckOpenALError("alGetSourcei AL_SOURCE_STATE"); + + if (sourceState != AL_PLAYING) { + alSourcePlay(source); + CheckOpenALError("alSourcePlay"); + } + + playbackPosition += samplesProcessed / channels; + } + } +}; + +#endif + +AudioSystem::~AudioSystem() { + // Stop the audio thread first + stopAudioThread(); + + // Stop and clean up audio output device + if (outputDevice) { + outputDevice->Stop(); + outputDevice.reset(); + } + + // Destructor implementation + sources.clear(); + audioData.clear(); + + // Clean up HRTF buffers + cleanupHRTFBuffers(); +} + +void AudioSystem::GenerateSineWavePing(float* buffer, uint32_t sampleCount, uint32_t playbackPosition) { + constexpr float sampleRate = 44100.0f; + const float frequency = 800.0f; // 800Hz ping + constexpr float pingDuration = 0.75f; // 0.75 second ping duration + constexpr auto pingSamples = static_cast(pingDuration * sampleRate); + constexpr float silenceDuration = 1.0f; // 1 second silence after ping + constexpr auto silenceSamples = static_cast(silenceDuration * sampleRate); + constexpr uint32_t totalCycleSamples = pingSamples + silenceSamples; + + const uint32_t attackSamples = static_cast(0.001f * sampleRate); // ~1ms attack + const uint32_t releaseSamples = static_cast(0.001f * sampleRate); // ~1ms release + constexpr float amplitude = 0.6f; + + for (uint32_t i = 0; i < sampleCount; i++) { + uint32_t globalPosition = 
playbackPosition + i; + uint32_t cyclePosition = globalPosition % totalCycleSamples; + + if (cyclePosition < pingSamples) { + float t = static_cast(cyclePosition) / sampleRate; + + // Minimal envelope for click prevention only + float envelope = 1.0f; + if (cyclePosition < attackSamples) { + envelope = static_cast(cyclePosition) / static_cast(std::max(1u, attackSamples)); + } else if (cyclePosition > pingSamples - releaseSamples) { + uint32_t relPos = pingSamples - cyclePosition; + envelope = static_cast(relPos) / static_cast(std::max(1u, releaseSamples)); + } + + float sineWave = sinf(2.0f * std::numbers::pi_v * frequency * t); + buffer[i] = amplitude * envelope * sineWave; + } else { + // Silence phase + buffer[i] = 0.0f; + } + } +} + +bool AudioSystem::Initialize(Engine* engine, Renderer* renderer) { + // Store the engine reference for accessing active camera + this->engine = engine; + + if (renderer) { + // Validate renderer if provided + if (!renderer->IsInitialized()) { + std::cerr << "AudioSystem::Initialize: Renderer is not initialized" << std::endl; + return false; + } + + // Store the renderer for compute shader support + this->renderer = renderer; + } else { + this->renderer = nullptr; + } + + // Generate default HRTF data for spatial audio processing + LoadHRTFData(""); // Pass empty filename to force generation of default HRTF data + + // Enable HRTF processing by default for 3D spatial audio + EnableHRTF(true); + + // Set default listener properties + SetListenerPosition(0.0f, 0.0f, 0.0f); + SetListenerOrientation(0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f); + SetListenerVelocity(0.0f, 0.0f, 0.0f); + SetMasterVolume(1.0f); + + // Initialize audio output device +#if defined(PLATFORM_ANDROID) + outputDevice = std::make_unique(); +#else + outputDevice = std::make_unique(); +#endif + + if (!outputDevice->Initialize(44100, 2, 1024)) { + std::cerr << "Failed to initialize audio output device" << std::endl; + return false; + } + + // Start audio output + if 
(!outputDevice->Start()) { + std::cerr << "Failed to start audio output device" << std::endl; + return false; + } + + // Start the background audio processing thread + startAudioThread(); + + initialized = true; + return true; +} + +void AudioSystem::Update(std::chrono::milliseconds deltaTime) { + if (!initialized) { + return; + } + + // Synchronize HRTF listener position and orientation with active camera + if (engine) { + const CameraComponent* activeCamera = engine->GetActiveCamera(); + if (activeCamera) { + // Get camera position + glm::vec3 cameraPos = activeCamera->GetPosition(); + SetListenerPosition(cameraPos.x, cameraPos.y, cameraPos.z); + + // Calculate camera forward and up vectors for orientation + // The camera looks at its target, so forward = normalize(target - position) + glm::vec3 target = activeCamera->GetTarget(); + glm::vec3 up = activeCamera->GetUp(); + glm::vec3 forward = glm::normalize(target - cameraPos); + + SetListenerOrientation(forward.x, forward.y, forward.z, up.x, up.y, up.z); + } + } + + // Update audio sources and process spatial audio + for (auto& source : sources) { + if (!source->IsPlaying()) { + continue; + } + + // Cast to ConcreteAudioSource to access timing methods + auto* concreteSource = dynamic_cast(source.get()); + + // Update playback timing and delay logic + concreteSource->UpdatePlayback(deltaTime, 0); + + // Only process audio if not in the delay phase + if (!concreteSource->ShouldProcessAudio()) { + continue; + } + + // Process audio with HRTF spatial processing (works with or without renderer) + if (hrtfEnabled && !hrtfData.empty()) { + // Get source position for spatial processing + const float* sourcePosition = concreteSource->GetPosition(); + + // Accumulate samples based on real time and process in fixed-size chunks to avoid tiny buffers + double acc = concreteSource->GetSampleAccumulator(); + acc += (static_cast(deltaTime.count()) * 44100.0) / 1000.0; // ms -> samples + constexpr uint32_t kChunk = 33075; + 
uint32_t available = static_cast(acc); + if (available < kChunk) { + // Not enough for a full chunk; keep accumulating + concreteSource->SetSampleAccumulator(acc); + continue; + } + // Process as many full chunks as available this frame + while (available >= kChunk) { + std::vector inputBuffer(kChunk, 0.0f); + std::vector outputBuffer(kChunk * 2, 0.0f); + uint32_t actualSamplesProcessed = 0; + + // Generate audio signal from loaded audio data or debug ping + auto audioIt = audioData.find(concreteSource->GetName()); + if (audioIt != audioData.end() && !audioIt->second.empty()) { + // Use actual loaded audio data with proper position tracking + const auto& data = audioIt->second; + uint32_t playbackPos = concreteSource->GetPlaybackPosition(); + + for (uint32_t i = 0; i < kChunk; i++) { + uint32_t dataIndex = (playbackPos + i) * 4; // 4 bytes per sample (16-bit stereo) + + if (dataIndex + 1 < data.size()) { + // Convert from 16-bit PCM to float + int16_t sample = *reinterpret_cast(&data[dataIndex]); + inputBuffer[i] = static_cast(sample) / 32768.0f; + actualSamplesProcessed++; + } else { + // Reached end of audio data + inputBuffer[i] = 0.0f; + } + } + } else { + // Generate sine wave ping for debugging + GenerateSineWavePing(inputBuffer.data(), kChunk, concreteSource->GetPlaybackPosition()); + actualSamplesProcessed = kChunk; + } + + // Build extended input [history | current] to preserve convolution continuity across chunks + uint32_t histLen = (hrtfSize > 0) ? 
(hrtfSize - 1) : 0; + static std::unordered_map> hrtfHistories; + auto& hist = hrtfHistories[concreteSource]; + if (hist.size() != histLen) { + hist.assign(histLen, 0.0f); + } + std::vector extendedInput(histLen + kChunk, 0.0f); + if (histLen > 0) { + std::memcpy(extendedInput.data(), hist.data(), histLen * sizeof(float)); + } + std::memcpy(extendedInput.data() + histLen, inputBuffer.data(), kChunk * sizeof(float)); + + // Submit for GPU HRTF processing via the background thread (trim will occur in processAudioTask) + submitAudioTask(extendedInput.data(), static_cast(extendedInput.size()), sourcePosition, actualSamplesProcessed, histLen); + + // Update history with the tail of current input + if (histLen > 0) { + std::memcpy(hist.data(), inputBuffer.data() + (kChunk - histLen), histLen * sizeof(float)); + } + + // Update playback timing with actual samples processed + concreteSource->UpdatePlayback(std::chrono::milliseconds(0), actualSamplesProcessed); + + // Consume one chunk from the accumulator + acc -= static_cast(kChunk); + available -= kChunk; + } + // Store fractional remainder for next frame + concreteSource->SetSampleAccumulator(acc); + } + } + + // Apply master volume changes to all active sources + for (auto& source : sources) { + if (source->IsPlaying()) { + // Master volume is applied during HRTF processing and individual source volume control + // Volume scaling is handled in the ProcessHRTF function + } + } + + // Clean up finished audio sources + std::erase_if(sources, + [](const std::unique_ptr& source) { + // Keep all sources active for continuous playback + // Audio sources can be stopped/started via their Play/Stop methods + return false; + }); + + // Update timing for audio processing with low-latency chunks + static std::chrono::milliseconds accumulatedTime = std::chrono::milliseconds(0); + accumulatedTime += deltaTime; + + // Process audio in 20ms chunks for optimal latency + constexpr std::chrono::milliseconds audioChunkTime = 
std::chrono::milliseconds(20); // 20ms chunks for real-time audio + if (accumulatedTime >= audioChunkTime) { + // Trigger audio buffer updates for smooth playback + // The HRTF processing ensures spatial audio is updated continuously + accumulatedTime = std::chrono::milliseconds(0); + + // Update listener properties if they have changed + // This ensures spatial audio positioning stays current with camera movement + } +} + +bool AudioSystem::LoadAudio(const std::string& filename, const std::string& name) { + std::string resolvedPath = renderer->ResolvePath(filename); + // Open the WAV file + std::ifstream file(resolvedPath, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open audio file: " << resolvedPath << " (original: " << filename << ")" << std::endl; + return false; + } + + // Read WAV header + struct WAVHeader { + char riff[4]; // "RIFF" + uint32_t fileSize; // File size - 8 + char wave[4]; // "WAVE" + char fmt[4]; // "fmt " + uint32_t fmtSize; // Format chunk size + uint16_t audioFormat; // Audio format (1 = PCM) + uint16_t numChannels; // Number of channels + uint32_t sampleRate; // Sample rate + uint32_t byteRate; // Byte rate + uint16_t blockAlign; // Block align + uint16_t bitsPerSample; // Bits per sample + char data[4]; // "data" + uint32_t dataSize; // Data size + }; + + WAVHeader header{}; + file.read(reinterpret_cast(&header), sizeof(WAVHeader)); + + // Validate WAV header + if (std::strncmp(header.riff, "RIFF", 4) != 0 || + std::strncmp(header.wave, "WAVE", 4) != 0 || + std::strncmp(header.fmt, "fmt ", 4) != 0 || + std::strncmp(header.data, "data", 4) != 0) { + std::cerr << "Invalid WAV file format: " << resolvedPath << " (original: " << filename << ")" << std::endl; + file.close(); + return false; + } + + // Only support PCM format for now + if (header.audioFormat != 1) { + std::cerr << "Unsupported audio format (only PCM supported): " << resolvedPath << " (original: " << filename << ")" << std::endl; + file.close(); + return 
false; + } + + // Read audio data + std::vector data(header.dataSize); + file.read(reinterpret_cast(data.data()), header.dataSize); + file.close(); + + if (file.gcount() != static_cast(header.dataSize)) { + std::cerr << "Failed to read complete audio data from: " << resolvedPath << " (original: " << filename << ")" << std::endl; + return false; + } + + // Store the audio data + audioData[name] = std::move(data); + + return true; +} + +AudioSource* AudioSystem::CreateAudioSource(const std::string& name) { + // Check if the audio data exists + auto it = audioData.find(name); + if (it == audioData.end()) { + std::cerr << "AudioSystem::CreateAudioSource: Audio data not found: " << name << std::endl; + return nullptr; + } + + // Create a new audio source + auto source = std::make_unique(name); + + // Calculate audio length in samples for timing + const auto& data = it->second; + if (!data.empty()) { + // Assuming 16-bit stereo audio at 44.1kHz (standard WAV format) + // The audio data reading uses dataIndex = (playbackPos + i) * 4 + // So we need to calculate length based on how many individual samples we can read + // Each 4 bytes represents one stereo sample pair, so total individual samples = data.size() / 4 + uint32_t totalSamples = static_cast(data.size()) / 4; + + // Set the audio length for proper timing + source->SetAudioLength(totalSamples); + } + + // Store the source + sources.push_back(std::move(source)); + + return sources.back().get(); +} + +AudioSource* AudioSystem::CreateDebugPingSource(const std::string& name) { + // Create a new audio source for debugging + auto source = std::make_unique(name); + + // Set up debug ping parameters + // The ping will cycle every 1.5 seconds (0.5s ping + 1.0s silence) + constexpr float sampleRate = 44100.0f; + constexpr float pingDuration = 0.5f; + constexpr float silenceDuration = 1.0f; + constexpr auto totalCycleSamples = static_cast((pingDuration + silenceDuration) * sampleRate); + + // For generated ping, let the 
generator control the 0.5s ping + 1.0s silence cycle. + // Disable source-level length/delay to avoid double-silence and audible resets. + source->SetAudioLength(0); + + // Store the source + sources.push_back(std::move(source)); + + return sources.back().get(); +} + +void AudioSystem::SetListenerPosition(const float x, const float y, const float z) { + listenerPosition[0] = x; + listenerPosition[1] = y; + listenerPosition[2] = z; +} + +void AudioSystem::SetListenerOrientation(const float forwardX, + const float forwardY, + const float forwardZ, + const float upX, + const float upY, + const float upZ) { + listenerOrientation[0] = forwardX; + listenerOrientation[1] = forwardY; + listenerOrientation[2] = forwardZ; + listenerOrientation[3] = upX; + listenerOrientation[4] = upY; + listenerOrientation[5] = upZ; +} + +void AudioSystem::SetListenerVelocity(const float x, const float y, const float z) { + listenerVelocity[0] = x; + listenerVelocity[1] = y; + listenerVelocity[2] = z; +} + +void AudioSystem::SetMasterVolume(const float volume) { + masterVolume = volume; +} + +void AudioSystem::EnableHRTF(const bool enable) { + hrtfEnabled = enable; +} + +bool AudioSystem::IsHRTFEnabled() const { + return hrtfEnabled; +} + +void AudioSystem::SetHRTFCPUOnly([[maybe_unused]] const bool cpuOnly) { + // Enforce GPU-only HRTF processing: ignore CPU-only requests + hrtfCPUOnly = false; +} + +bool AudioSystem::IsHRTFCPUOnly() const { + return hrtfCPUOnly; +} + +bool AudioSystem::LoadHRTFData(const std::string& filename) { + // HRTF parameters + constexpr uint32_t hrtfSampleCount = 256; // Number of samples per impulse response + constexpr uint32_t positionCount = 36 * 13; // 36 azimuths (10-degree steps) * 13 elevations (15-degree steps) + constexpr uint32_t channelCount = 2; // Stereo (left and right ears) + const float sampleRate = 44100.0f; // Sample rate for HRTF data + const float speedOfSound = 343.0f; // Speed of sound in m/s + const float headRadius = 0.0875f; // Average 
head radius in meters + + // Try to load from a file first (only if the filename is provided) + if (!filename.empty()) { + std::string resolvedPath = renderer->ResolvePath(filename); + if (std::ifstream file(resolvedPath, std::ios::binary); file.is_open()) { + // Read the file header to determine a format + char header[4]; + file.read(header, 4); + + if (std::strncmp(header, "HRTF", 4) == 0) { + // Custom HRTF format + uint32_t fileHrtfSize, filePositionCount, fileChannelCount; + file.read(reinterpret_cast(&fileHrtfSize), sizeof(uint32_t)); + file.read(reinterpret_cast(&filePositionCount), sizeof(uint32_t)); + file.read(reinterpret_cast(&fileChannelCount), sizeof(uint32_t)); + + if (fileChannelCount == channelCount) { + hrtfData.resize(fileHrtfSize * filePositionCount * fileChannelCount); + file.read(reinterpret_cast(hrtfData.data()), static_cast(hrtfData.size() * sizeof(float))); + + hrtfSize = fileHrtfSize; + numHrtfPositions = filePositionCount; + + file.close(); + return true; + } + } + file.close(); + } + } + + // Generate realistic HRTF data based on acoustic modeling + // Resize the HRTF data vector + hrtfData.resize(hrtfSampleCount * positionCount * channelCount); + + // Generate HRTF impulse responses for each position + for (uint32_t pos = 0; pos < positionCount; pos++) { + // Calculate azimuth and elevation for this position + uint32_t azimuthIndex = pos % 36; + uint32_t elevationIndex = pos / 36; + + float azimuth = (static_cast(azimuthIndex) * 10.0f - 180.0f) * std::numbers::pi_v / 180.0f; + float elevation = (static_cast(elevationIndex) * 15.0f - 90.0f) * std::numbers::pi_v / 180.0f; + + // Convert to Cartesian coordinates + float x = std::cos(elevation) * std::sin(azimuth); + float y = std::sin(elevation); + float z = std::cos(elevation) * std::cos(azimuth); + + for (uint32_t channel = 0; channel < channelCount; channel++) { + // Calculate ear position (left ear: -0.1m, right ear: +0.1m on x-axis) + float earX = (channel == 0) ? 
-0.1f : 0.1f; + + // Calculate distance from source to ear + float dx = x - earX; + float dy = y; + float dz = z; + float distance = std::sqrt(dx * dx + dy * dy + dz * dz); + + // Calculate time delay (ITD - Interaural Time Difference) + float timeDelay = distance / speedOfSound; + auto sampleDelay = static_cast(timeDelay * sampleRate); + + // Calculate head shadow effect (ILD - Interaural Level Difference) + float shadowFactor = 1.0f; + if (channel == 0 && azimuth > 0) { + // Left ear, source on right + shadowFactor = 0.3f + 0.7f * std::exp(-azimuth * 2.0f); + } else if (channel == 1 && azimuth < 0) { + // Right ear, source on left + shadowFactor = 0.3f + 0.7f * std::exp(azimuth * 2.0f); + } + + // Generate impulse response + uint32_t samplesGenerated = 0; + for (uint32_t i = 0; i < hrtfSampleCount; i++) { + float value = 0.0f; + + // Direct path impulse + if (i >= sampleDelay && i < sampleDelay + 10) { + float t = static_cast(i - sampleDelay) / sampleRate; + value = shadowFactor * std::exp(-t * 1000.0f) * std::cos(2.0f * std::numbers::pi_v * 1000.0f * t); + } + + // Apply distance attenuation + value /= std::max(1.0f, distance); + + uint32_t index = pos * hrtfSampleCount * channelCount + channel * hrtfSampleCount + i; + hrtfData[index] = value; + } + } + } + + // Store HRTF parameters + hrtfSize = hrtfSampleCount; + numHrtfPositions = positionCount; + + return true; +} + +bool AudioSystem::ProcessHRTF(const float* inputBuffer, float* outputBuffer, uint32_t sampleCount, const float* sourcePosition) { + if (!hrtfEnabled) { + // If HRTF is disabled, just copy input to output + for (uint32_t i = 0; i < sampleCount; i++) { + outputBuffer[i * 2] = inputBuffer[i]; // Left channel + outputBuffer[i * 2 + 1] = inputBuffer[i]; // Right channel + } + return true; + } + + // Check if we should use CPU-only processing or if Vulkan is not available + // Also force CPU processing if we've detected threading issues previously + static bool forceGPUFallback = false; + if 
(hrtfCPUOnly || !renderer || !renderer->IsInitialized() || forceGPUFallback) { + // Use CPU-based HRTF processing (either forced or fallback) + + // Create buffers for HRTF processing if they don't exist or if the sample count has changed + if (!createHRTFBuffers(sampleCount)) { + std::cerr << "Failed to create HRTF buffers" << std::endl; + return false; + } + + // Copy input data to input buffer + void* data = inputBufferMemory.mapMemory(0, sampleCount * sizeof(float)); + memcpy(data, inputBuffer, sampleCount * sizeof(float)); + inputBufferMemory.unmapMemory(); + + // Copy source and listener positions + memcpy(params.sourcePosition, sourcePosition, sizeof(float) * 3); + memcpy(params.listenerPosition, listenerPosition, sizeof(float) * 3); + memcpy(params.listenerOrientation, listenerOrientation, sizeof(float) * 6); + params.sampleCount = sampleCount; + params.hrtfSize = hrtfSize; + params.numHrtfPositions = numHrtfPositions; + params.padding = 0.0f; + + // Copy parameters to parameter buffer using persistent memory mapping + if (persistentParamsMemory) { + memcpy(persistentParamsMemory, ¶ms, sizeof(HRTFParams)); + } else { + std::cerr << "WARNING: Persistent memory not available, falling back to map/unmap" << std::endl; + data = paramsBufferMemory.mapMemory(0, sizeof(HRTFParams)); + memcpy(data, ¶ms, sizeof(HRTFParams)); + paramsBufferMemory.unmapMemory(); + } + + // Perform HRTF processing using CPU-based convolution + // This implementation provides real-time 3D audio spatialization + + // Calculate direction from listener to source + float direction[3]; + direction[0] = sourcePosition[0] - listenerPosition[0]; + direction[1] = sourcePosition[1] - listenerPosition[1]; + direction[2] = sourcePosition[2] - listenerPosition[2]; + + // Normalize direction + float length = std::sqrt(direction[0] * direction[0] + direction[1] * direction[1] + direction[2] * direction[2]); + if (length > 0.0001f) { + direction[0] /= length; + direction[1] /= length; + direction[2] /= 
length; + } else { + direction[0] = 0.0f; + direction[1] = 0.0f; + direction[2] = -1.0f; // Default to front + } + + // Calculate azimuth and elevation + float azimuth = std::atan2(direction[0], direction[2]); + float elevation = std::asin(std::max(-1.0f, std::min(1.0f, direction[1]))); + + // Convert to indices + int azimuthIndex = static_cast((azimuth + std::numbers::pi_v) / (2.0f * std::numbers::pi_v) * 36.0f) % 36; + int elevationIndex = static_cast((elevation + std::numbers::pi_v / 2.0f) / std::numbers::pi_v * 13.0f); + elevationIndex = std::max(0, std::min(12, elevationIndex)); + + // Get HRTF index + int hrtfIndex = elevationIndex * 36 + azimuthIndex; + hrtfIndex = std::min(hrtfIndex, static_cast(numHrtfPositions) - 1); + + // Perform convolution for left and right ears with simple overlap-add using per-direction input history + static std::unordered_map> convHistories; // mono histories keyed by hrtfIndex + const uint32_t histLenDesired = (hrtfSize > 0) ? (hrtfSize - 1) : 0; + auto& convHistory = convHistories[hrtfIndex]; + if (convHistory.size() != histLenDesired) { + convHistory.assign(histLenDesired, 0.0f); + } + + // Build extended input: [history | current input] + std::vector extInput(histLenDesired + sampleCount, 0.0f); + if (histLenDesired > 0) { + std::memcpy(extInput.data(), convHistory.data(), histLenDesired * sizeof(float)); + } + if (sampleCount > 0) { + std::memcpy(extInput.data() + histLenDesired, inputBuffer, sampleCount * sizeof(float)); + } + + for (uint32_t i = 0; i < sampleCount; i++) { + float leftSample = 0.0f; + float rightSample = 0.0f; + + // Convolve with HRTF impulse response using extended input + // extIndex = histLenDesired + i - j; ensure extIndex >= 0 + uint32_t jMax = std::min(hrtfSize - 1, histLenDesired + i); + for (uint32_t j = 0; j <= jMax; j++) { + uint32_t extIndex = histLenDesired + i - j; + uint32_t hrtfLeftIndex = hrtfIndex * hrtfSize * 2 + j; + uint32_t hrtfRightIndex = hrtfIndex * hrtfSize * 2 + hrtfSize + j; + + 
if (hrtfLeftIndex < hrtfData.size() && hrtfRightIndex < hrtfData.size()) { + float in = extInput[extIndex]; + leftSample += in * hrtfData[hrtfLeftIndex]; + rightSample += in * hrtfData[hrtfRightIndex]; + } + } + + // Apply distance attenuation + float distanceAttenuation = 1.0f / std::max(1.0f, length); + leftSample *= distanceAttenuation; + rightSample *= distanceAttenuation; + + // Write to output buffer + outputBuffer[i * 2] = leftSample; + outputBuffer[i * 2 + 1] = rightSample; + } + + // Update history with the tail of the extended input + if (histLenDesired > 0) { + std::memcpy(convHistory.data(), extInput.data() + sampleCount, histLenDesired * sizeof(float)); + } + + return true; + } else { + // Use Vulkan shader-based HRTF processing with fallback to CPU + try { + // Validate HRTF data exists + if (hrtfData.empty()) { + LoadHRTFData(""); // Generate HRTF data + } + + // Create buffers for HRTF processing if they don't exist or if the sample count has changed + if (!createHRTFBuffers(sampleCount)) { + std::cerr << "Failed to create HRTF buffers, falling back to CPU processing" << std::endl; + throw std::runtime_error("Buffer creation failed"); + } + + // Copy input data to input buffer + void* data = inputBufferMemory.mapMemory(0, sampleCount * sizeof(float)); + memcpy(data, inputBuffer, sampleCount * sizeof(float)); + + inputBufferMemory.unmapMemory(); + + // Set up HRTF parameters with proper std140 uniform buffer layout + struct alignas(16) HRTFParams { + float listenerPosition[4]; // vec3 + padding (16 bytes) - offset 0 + float listenerForward[4]; // vec3 + padding (16 bytes) - offset 16 + float listenerUp[4]; // vec3 + padding (16 bytes) - offset 32 + float sourcePosition[4]; // vec3 + padding (16 bytes) - offset 48 + float sampleCount; // float (4 bytes) - offset 64 + float padding1[3]; // Padding to align to 16-byte boundary - offset 68 + uint32_t inputChannels; // uint (4 bytes) - offset 80 + uint32_t outputChannels; // uint (4 bytes) - offset 84 + 
uint32_t hrtfSize; // uint (4 bytes) - offset 88 + uint32_t numHrtfPositions; // uint (4 bytes) - offset 92 + float distanceAttenuation; // float (4 bytes) - offset 96 + float dopplerFactor; // float (4 bytes) - offset 100 + float reverbMix; // float (4 bytes) - offset 104 + float padding2; // Padding to complete 16-byte alignment - offset 108 + } params{}; + + // Copy listener and source positions with proper padding for GPU alignment + memcpy(params.listenerPosition, listenerPosition, sizeof(float) * 3); + params.listenerPosition[3] = 0.0f; // Padding for float3 alignment + memcpy(params.listenerForward, &listenerOrientation[0], sizeof(float) * 3); // Forward vector + params.listenerForward[3] = 0.0f; // Padding for float3 alignment + memcpy(params.listenerUp, &listenerOrientation[3], sizeof(float) * 3); // Up vector + params.listenerUp[3] = 0.0f; // Padding for float3 alignment + memcpy(params.sourcePosition, sourcePosition, sizeof(float) * 3); + params.sourcePosition[3] = 0.0f; // Padding for float3 alignment + params.sampleCount = static_cast(sampleCount); // Number of samples to process + params.padding1[0] = params.padding1[1] = params.padding1[2] = 0.0f; // Initialize padding + params.inputChannels = 1; // Mono input + params.outputChannels = 2; // Stereo output + params.hrtfSize = hrtfSize; + params.numHrtfPositions = numHrtfPositions; + params.distanceAttenuation = 1.0f; + params.dopplerFactor = 1.0f; + params.reverbMix = 0.0f; + params.padding2 = 0.0f; // Initialize padding + + // Copy parameters to parameter buffer using persistent memory mapping + if (persistentParamsMemory) { + memcpy(persistentParamsMemory, ¶ms, sizeof(HRTFParams)); + } else { + std::cerr << "ERROR: Persistent memory not available for GPU processing!" 
<< std::endl; + throw std::runtime_error("Persistent memory required for GPU processing"); + } + + // Use renderer's main compute pipeline instead of dedicated HRTF pipeline + uint32_t workGroupSize = 64; // Must match the numthreads in the shader + uint32_t groupCountX = (sampleCount + workGroupSize - 1) / workGroupSize; + + // Use renderer's main compute pipeline dispatch method + auto computeFence = renderer->DispatchCompute(groupCountX, + 1, + 1, + *this->inputBuffer, + *this->outputBuffer, + *this->hrtfBuffer, + *this->paramsBuffer); + + // Wait for compute shader to complete using fence-based synchronization + const vk::raii::Device& device = renderer->GetRaiiDevice(); + vk::Result result = device.waitForFences(*computeFence, VK_TRUE, UINT64_MAX); + if (result != vk::Result::eSuccess) { + std::cerr << "Failed to wait for compute fence: " << vk::to_string(result) << std::endl; + throw std::runtime_error("Fence wait failed"); + } + + // Copy results from output buffer to the output array + void* outputData = outputBufferMemory.mapMemory(0, sampleCount * 2 * sizeof(float)); + + memcpy(outputBuffer, outputData, sampleCount * 2 * sizeof(float)); + outputBufferMemory.unmapMemory(); + + return true; + } catch (const std::exception& e) { + std::cerr << "GPU HRTF processing failed: " << e.what() << std::endl; + std::cerr << "CPU fallback disabled - GPU path required" << std::endl; + throw; // Re-throw the exception to ensure failure without CPU fallback + } + } +} + +bool AudioSystem::createHRTFBuffers(uint32_t sampleCount) { + // Smart buffer reuse: only recreate if sample count changed significantly or buffers don't exist + if (currentSampleCount == sampleCount && *inputBuffer && *outputBuffer && *hrtfBuffer && *paramsBuffer) { + return true; + } + + // Ensure all GPU operations complete before cleaning up existing buffers. + // External synchronization required (VVL): use renderer helper which serializes against queue usage. 
+ if (renderer) { + renderer->WaitIdle(); + } + + // Clean up existing buffers only if we need to recreate them + cleanupHRTFBuffers(); + + if (!renderer) { + std::cerr << "AudioSystem::createHRTFBuffers: Renderer is null" << std::endl; + return false; + } + + const vk::raii::Device& device = renderer->GetRaiiDevice(); + try { + // Create input buffer (mono audio) + vk::BufferCreateInfo inputBufferInfo; + inputBufferInfo.size = sampleCount * sizeof(float); + inputBufferInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer; + inputBufferInfo.sharingMode = vk::SharingMode::eExclusive; + + inputBuffer = vk::raii::Buffer(device, inputBufferInfo); + + vk::MemoryRequirements inputMemRequirements = inputBuffer.getMemoryRequirements(); + + vk::MemoryAllocateInfo inputAllocInfo; + inputAllocInfo.allocationSize = inputMemRequirements.size; + inputAllocInfo.memoryTypeIndex = renderer->FindMemoryType( + inputMemRequirements.memoryTypeBits, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + inputBufferMemory = vk::raii::DeviceMemory(device, inputAllocInfo); + inputBuffer.bindMemory(*inputBufferMemory, 0); + + // Create output buffer (stereo audio) + vk::BufferCreateInfo outputBufferInfo; + outputBufferInfo.size = sampleCount * 2 * sizeof(float); // Stereo (2 channels) + outputBufferInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer; + outputBufferInfo.sharingMode = vk::SharingMode::eExclusive; + + outputBuffer = vk::raii::Buffer(device, outputBufferInfo); + + vk::MemoryRequirements outputMemRequirements = outputBuffer.getMemoryRequirements(); + + vk::MemoryAllocateInfo outputAllocInfo; + outputAllocInfo.allocationSize = outputMemRequirements.size; + outputAllocInfo.memoryTypeIndex = renderer->FindMemoryType( + outputMemRequirements.memoryTypeBits, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + outputBufferMemory = vk::raii::DeviceMemory(device, outputAllocInfo); + 
outputBuffer.bindMemory(*outputBufferMemory, 0); + + // Create HRTF data buffer + vk::BufferCreateInfo hrtfBufferInfo; + hrtfBufferInfo.size = hrtfData.size() * sizeof(float); + hrtfBufferInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer; + hrtfBufferInfo.sharingMode = vk::SharingMode::eExclusive; + + hrtfBuffer = vk::raii::Buffer(device, hrtfBufferInfo); + + vk::MemoryRequirements hrtfMemRequirements = hrtfBuffer.getMemoryRequirements(); + + vk::MemoryAllocateInfo hrtfAllocInfo; + hrtfAllocInfo.allocationSize = hrtfMemRequirements.size; + hrtfAllocInfo.memoryTypeIndex = renderer->FindMemoryType( + hrtfMemRequirements.memoryTypeBits, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + hrtfBufferMemory = vk::raii::DeviceMemory(device, hrtfAllocInfo); + hrtfBuffer.bindMemory(*hrtfBufferMemory, 0); + + // Copy HRTF data to buffer + void* hrtfMappedMemory = hrtfBufferMemory.mapMemory(0, hrtfData.size() * sizeof(float)); + memcpy(hrtfMappedMemory, hrtfData.data(), hrtfData.size() * sizeof(float)); + hrtfBufferMemory.unmapMemory(); + + // Create parameters buffer - use the correct GPU structure size + // The GPU processing uses a larger aligned structure (112 bytes) not the header struct (64 bytes) + struct alignas(16) GPUHRTFParams { + float listenerPosition[4]; // vec3 + padding (16 bytes) + float listenerForward[4]; // vec3 + padding (16 bytes) + float listenerUp[4]; // vec3 + padding (16 bytes) + float sourcePosition[4]; // vec3 + padding (16 bytes) + float sampleCount; // float (4 bytes) + float padding1[3]; // Padding to align to 16-byte boundary + uint32_t inputChannels; // uint (4 bytes) + uint32_t outputChannels; // uint (4 bytes) + uint32_t hrtfSize; // uint (4 bytes) + uint32_t numHrtfPositions; // uint (4 bytes) + float distanceAttenuation; // float (4 bytes) + float dopplerFactor; // float (4 bytes) + float reverbMix; // float (4 bytes) + float padding2; // Padding to complete 16-byte alignment + }; + + 
vk::BufferCreateInfo paramsBufferInfo; + paramsBufferInfo.size = sizeof(GPUHRTFParams); // Use correct GPU structure size (112 bytes) + paramsBufferInfo.usage = vk::BufferUsageFlagBits::eUniformBuffer; + paramsBufferInfo.sharingMode = vk::SharingMode::eExclusive; + + paramsBuffer = vk::raii::Buffer(device, paramsBufferInfo); + + vk::MemoryRequirements paramsMemRequirements = paramsBuffer.getMemoryRequirements(); + + vk::MemoryAllocateInfo paramsAllocInfo; + paramsAllocInfo.allocationSize = paramsMemRequirements.size; + paramsAllocInfo.memoryTypeIndex = renderer->FindMemoryType( + paramsMemRequirements.memoryTypeBits, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + paramsBufferMemory = vk::raii::DeviceMemory(device, paramsAllocInfo); + paramsBuffer.bindMemory(*paramsBufferMemory, 0); + + // Set up persistent memory mapping for parameters buffer to avoid repeated map/unmap operations + persistentParamsMemory = paramsBufferMemory.mapMemory(0, sizeof(GPUHRTFParams)); + // Update current sample count to track buffer size + currentSampleCount = sampleCount; + return true; + } catch (const std::exception& e) { + std::cerr << "Error creating HRTF buffers: " << e.what() << std::endl; + cleanupHRTFBuffers(); + return false; + } +} + +void AudioSystem::cleanupHRTFBuffers() { + // Unmap persistent memory if it exists + if (persistentParamsMemory && *paramsBufferMemory) { + paramsBufferMemory.unmapMemory(); + persistentParamsMemory = nullptr; + } + + // With RAII, we just need to set the resources to nullptr + // The destructors will handle the cleanup + inputBuffer = nullptr; + inputBufferMemory = nullptr; + outputBuffer = nullptr; + outputBufferMemory = nullptr; + hrtfBuffer = nullptr; + hrtfBufferMemory = nullptr; + paramsBuffer = nullptr; + paramsBufferMemory = nullptr; + + // Reset sample count tracking + currentSampleCount = 0; +} + +// Threading implementation methods + +void AudioSystem::startAudioThread() { + if 
(audioThreadRunning.load()) { + return; // Thread already running + } + + audioThreadShouldStop.store(false); + audioThreadRunning.store(true); + + audioThread = std::thread(&AudioSystem::audioThreadLoop, this); +} + +void AudioSystem::stopAudioThread() { + if (!audioThreadRunning.load()) { + return; // Thread not running + } + + // Signal the thread to stop + audioThreadShouldStop.store(true); + + // Wake up the thread if it's waiting + audioCondition.notify_all(); + + // Wait for the thread to finish + if (audioThread.joinable()) { + audioThread.join(); + } + + audioThreadRunning.store(false); +} + +void AudioSystem::audioThreadLoop() { + while (!audioThreadShouldStop.load()) { + std::shared_ptr task = nullptr; + + // Wait for a task or stop signal + { + std::unique_lock lock(taskQueueMutex); + audioCondition.wait(lock, + [this] { + return !audioTaskQueue.empty() || audioThreadShouldStop.load(); + }); + + if (audioThreadShouldStop.load()) { + break; + } + + if (!audioTaskQueue.empty()) { + task = audioTaskQueue.front(); + audioTaskQueue.pop(); + } + } + + // Process the task if we have one + if (task) { + processAudioTask(task); + } + } +} + +void AudioSystem::processAudioTask(const std::shared_ptr& task) { + // Process HRTF in the background thread + bool success = ProcessHRTF(task->inputBuffer.data(), + task->outputBuffer.data(), + task->sampleCount, + task->sourcePosition); + + if (success && task->outputDevice && task->outputDevice->IsPlaying()) { + // We used extended input of length sampleCount = histLen + outFrames. + // Trim the first trimFront frames from the stereo output and only write actualSamplesProcessed frames. 
+ uint32_t startFrame = task->trimFront; + uint32_t framesToWrite = task->actualSamplesProcessed; + if (startFrame * 2 > task->outputBuffer.size()) { + startFrame = 0; // safety + } + if (startFrame * 2 + framesToWrite * 2 > task->outputBuffer.size()) { + framesToWrite = static_cast((task->outputBuffer.size() / 2) - startFrame); + } + float* startPtr = task->outputBuffer.data() + startFrame * 2; + // Apply master volume only to the range we will write + for (uint32_t i = 0; i < framesToWrite * 2; i++) { + startPtr[i] *= task->masterVolume; + } + // Send processed audio directly to output device from background thread + if (!task->outputDevice->WriteAudio(startPtr, framesToWrite)) { + std::cerr << "Failed to write audio data to output device from background thread" << std::endl; + } + } +} + +bool AudioSystem::submitAudioTask(const float* inputBuffer, + uint32_t sampleCount, + const float* sourcePosition, + uint32_t actualSamplesProcessed, + uint32_t trimFront) { + if (!audioThreadRunning.load()) { + // Fallback to synchronous processing if the thread is not running + std::vector outputBuffer(sampleCount * 2); + bool success = ProcessHRTF(inputBuffer, outputBuffer.data(), sampleCount, sourcePosition); + + if (success && outputDevice && outputDevice->IsPlaying()) { + // Apply master volume + for (uint32_t i = 0; i < sampleCount * 2; i++) { + outputBuffer[i] *= masterVolume; + } + + // Send to audio output device + if (!outputDevice->WriteAudio(outputBuffer.data(), sampleCount)) { + std::cerr << "Failed to write audio data to output device" << std::endl; + return false; + } + } + return success; + } + + // Create a new task for asynchronous processing + auto task = std::make_shared(); + task->inputBuffer.assign(inputBuffer, inputBuffer + sampleCount); + task->outputBuffer.resize(sampleCount * 2); // Stereo output + memcpy(task->sourcePosition, sourcePosition, sizeof(float) * 3); + task->sampleCount = sampleCount; // includes history frames + 
task->actualSamplesProcessed = actualSamplesProcessed; // new frames only (kChunk) + task->trimFront = sampleCount - actualSamplesProcessed; // history length (histLen) + task->outputDevice = outputDevice.get(); + task->masterVolume = masterVolume; + + // Submit the task to the queue (non-blocking) + { + std::lock_guard lock(taskQueueMutex); + audioTaskQueue.push(task); + } + audioCondition.notify_one(); + + return true; // Return immediately without waiting +} + +void AudioSystem::FlushOutput() { + // Stop background processing to avoid races while flushing + stopAudioThread(); + + // Clear any pending audio processing tasks + { + std::lock_guard lock(taskQueueMutex); + std::queue> empty; + std::swap(audioTaskQueue, empty); + } + + // Flush the output device buffers and queues by restart + if (outputDevice) { + outputDevice->Stop(); + outputDevice->Start(); + } + + // Restart background processing + startAudioThread(); +} diff --git a/attachments/sync2_engine/audio_system.h b/attachments/sync2_engine/audio_system.h new file mode 100644 index 00000000..9ddb365d --- /dev/null +++ b/attachments/sync2_engine/audio_system.h @@ -0,0 +1,429 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * @brief Class representing an audio source. + */ +class AudioSource { + public: + /** + * @brief Default constructor. + */ + AudioSource() = default; + + /** + * @brief Destructor for proper cleanup. + */ + virtual ~AudioSource() = default; + + /** + * @brief Play the audio source. + */ + virtual void Play() = 0; + + /** + * @brief Pause the audio source. + */ + virtual void Pause() = 0; + + /** + * @brief Stop the audio source. + */ + virtual void Stop() = 0; + + /** + * @brief Set the volume of the audio source. + * @param volume The volume (0.0f to 1.0f). + */ + virtual void SetVolume(float volume) = 0; + + /** + * @brief Set whether the audio source should loop. + * @param loop Whether to loop. + */ + virtual void SetLoop(bool loop) = 0; + + /** + * @brief Set the position of the audio source in 3D space. + * @param x The x-coordinate. + * @param y The y-coordinate. + * @param z The z-coordinate. + */ + virtual void SetPosition(float x, float y, float z) = 0; + + /** + * @brief Set the velocity of the audio source in 3D space. + * @param x The x-component. + * @param y The y-component. + * @param z The z-component. + */ + virtual void SetVelocity(float x, float y, float z) = 0; + + /** + * @brief Check if the audio source is playing. + * @return True if playing, false otherwise. + */ + virtual bool IsPlaying() const = 0; +}; + +// Forward declarations +class Renderer; +class Engine; + +/** + * @brief Interface for audio output devices. + */ +class AudioOutputDevice { + public: + /** + * @brief Default constructor. + */ + AudioOutputDevice() = default; + + /** + * @brief Virtual destructor for proper cleanup. + */ + virtual ~AudioOutputDevice() = default; + + /** + * @brief Initialize the audio output device. + * @param sampleRate The sample rate (e.g., 44100). 
+ * @param channels The number of channels (typically 2 for stereo). + * @param bufferSize The buffer size in samples. + * @return True if initialization was successful, false otherwise. + */ + virtual bool Initialize(uint32_t sampleRate, uint32_t channels, uint32_t bufferSize) = 0; + + /** + * @brief Start audio playback. + * @return True if successful, false otherwise. + */ + virtual bool Start() = 0; + + /** + * @brief Stop audio playback. + * @return True if successful, false otherwise. + */ + virtual bool Stop() = 0; + + /** + * @brief Write audio data to the output device. + * @param data Pointer to the audio data (interleaved stereo float samples). + * @param sampleCount Number of samples per channel to write. + * @return True if successful, false otherwise. + */ + virtual bool WriteAudio(const float* data, uint32_t sampleCount) = 0; + + /** + * @brief Check if the device is currently playing. + * @return True if playing, false otherwise. + */ + virtual bool IsPlaying() const = 0; + + /** + * @brief Get the current playback position in samples. + * @return Current position in samples. + */ + virtual uint32_t GetPosition() const = 0; +}; + +/** + * @brief Class for managing audio. + */ +class AudioSystem { + public: + /** + * @brief Default constructor. + */ + AudioSystem() = default; + + // Constructor-based initialization to replace separate Initialize() calls + AudioSystem(Engine* engine, Renderer* renderer) { + if (!Initialize(engine, renderer)) { + throw std::runtime_error("AudioSystem: initialization failed"); + } + } + + /** + * @brief Flush audio output: clears pending processing and device buffers so playback restarts cleanly. + */ + void FlushOutput(); + + /** + * @brief Destructor for proper cleanup. + */ + ~AudioSystem(); + + /** + * @brief Update the audio system. + * @param deltaTime The time elapsed since the last update. + */ + void Update(std::chrono::milliseconds deltaTime); + + /** + * @brief Load an audio file. 
+ * @param filename The path to the audio file. + * @param name The name to assign to the audio. + * @return True if loading was successful, false otherwise. + */ + bool LoadAudio(const std::string& filename, const std::string& name); + + /** + * @brief Create an audio source. + * @param name The name of the audio to use. + * @return Pointer to the created audio source, or nullptr if creation failed. + */ + AudioSource* CreateAudioSource(const std::string& name); + + /** + * @brief Create a sine wave ping audio source for debugging. + * @param name The name to assign to the debug audio source. + * @return Pointer to the created audio source, or nullptr if creation failed. + */ + AudioSource* CreateDebugPingSource(const std::string& name); + + /** + * @brief Set the listener position in 3D space. + * @param x The x-coordinate. + * @param y The y-coordinate. + * @param z The z-coordinate. + */ + void SetListenerPosition(float x, float y, float z); + + /** + * @brief Set the listener orientation in 3D space. + * @param forwardX The x-component of the forward vector. + * @param forwardY The y-component of the forward vector. + * @param forwardZ The z-component of the forward vector. + * @param upX The x-component of the up vector. + * @param upY The y-component of the up vector. + * @param upZ The z-component of the up vector. + */ + void SetListenerOrientation(float forwardX, + float forwardY, + float forwardZ, + float upX, + float upY, + float upZ); + + /** + * @brief Set the listener velocity in 3D space. + * @param x The x-component. + * @param y The y-component. + * @param z The z-component. + */ + void SetListenerVelocity(float x, float y, float z); + + /** + * @brief Set the master volume. + * @param volume The volume (0.0f to 1.0f). + */ + void SetMasterVolume(float volume); + + /** + * @brief Enable HRTF (Head-Related Transfer Function) processing. + * @param enable Whether to enable HRTF processing. 
+ */ + void EnableHRTF(bool enable); + + /** + * @brief Check if HRTF processing is enabled. + * @return True if HRTF processing is enabled, false otherwise. + */ + bool IsHRTFEnabled() const; + + /** + * @brief Set whether to force CPU-only HRTF processing. + * @param cpuOnly Whether to force CPU-only processing (true) or allow Vulkan shader processing (false). + */ + void SetHRTFCPUOnly(bool cpuOnly); + + /** + * @brief Check if HRTF processing is set to CPU-only mode. + * @return True if CPU-only mode is enabled, false if Vulkan shader processing is allowed. + */ + bool IsHRTFCPUOnly() const; + + /** + * @brief Load HRTF data from a file. + * @param filename The path to the HRTF data file. + * @return True if loading was successful, false otherwise. + */ + bool LoadHRTFData(const std::string& filename); + + /** + * @brief Process audio data with HRTF. + * @param inputBuffer The input audio buffer. + * @param outputBuffer The output audio buffer. + * @param sampleCount The number of samples to process. + * @param sourcePosition The position of the sound source. + * @return True if processing was successful, false otherwise. + */ + bool ProcessHRTF(const float* inputBuffer, float* outputBuffer, uint32_t sampleCount, const float* sourcePosition); + + /** + * @brief Generate a sine wave ping for debugging purposes. + * @param buffer The output buffer to fill with ping audio data. + * @param sampleCount The number of samples to generate. + * @param playbackPosition The current playback position for timing. + */ + static void GenerateSineWavePing(float* buffer, uint32_t sampleCount, uint32_t playbackPosition); + + private: + /** + * @brief Initialize the audio system (called by constructor). + * @param engine Pointer to the engine for accessing active camera. + * @param renderer Pointer to the renderer for compute shader support. + * @return True if initialization was successful, false otherwise. 
+ */ + bool Initialize(Engine* engine, Renderer* renderer = nullptr); + + // Loaded audio data + std::unordered_map> audioData; + + // Audio sources + std::vector> sources; + + // Listener properties + float listenerPosition[3] = {0.0f, 0.0f, 0.0f}; + float listenerOrientation[6] = {0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f}; + float listenerVelocity[3] = {0.0f, 0.0f, 0.0f}; + + // Master volume + float masterVolume = 1.0f; + + // Whether the audio system is initialized + bool initialized = false; + + // HRTF processing + bool hrtfEnabled = false; + bool hrtfCPUOnly = false; + std::vector hrtfData; + uint32_t hrtfSize = 0; + uint32_t numHrtfPositions = 0; + + // Renderer for compute shader support + Renderer* renderer = nullptr; + + // Engine reference for accessing active camera + Engine* engine = nullptr; + + // Audio output device for sending processed audio to speakers + std::unique_ptr outputDevice = nullptr; + + // Threading infrastructure for background audio processing + std::thread audioThread; + std::mutex audioMutex; + std::condition_variable audioCondition; + std::atomic audioThreadRunning{false}; + std::atomic audioThreadShouldStop{false}; + + // Audio processing task queue + struct AudioTask { + std::vector inputBuffer; + std::vector outputBuffer; + float sourcePosition[3]; + uint32_t sampleCount; // total frames in input/output (may include history) + uint32_t actualSamplesProcessed; // frames to write this tick (new part) + uint32_t trimFront; // frames to skip from output front (history length) + AudioOutputDevice* outputDevice; + float masterVolume; + }; + // Set up HRTF parameters + struct HRTFParams { + float sourcePosition[3]; + float listenerPosition[3]; + float listenerOrientation[6]; // Forward (3) and up (3) vectors + uint32_t sampleCount; + uint32_t hrtfSize; + uint32_t numHrtfPositions; + float padding; // For alignment + } params; + std::queue> audioTaskQueue; + std::mutex taskQueueMutex; + + // Vulkan resources for HRTF processing + 
vk::raii::Buffer inputBuffer = nullptr; + vk::raii::DeviceMemory inputBufferMemory = nullptr; + vk::raii::Buffer outputBuffer = nullptr; + vk::raii::DeviceMemory outputBufferMemory = nullptr; + vk::raii::Buffer hrtfBuffer = nullptr; + vk::raii::DeviceMemory hrtfBufferMemory = nullptr; + vk::raii::Buffer paramsBuffer = nullptr; + vk::raii::DeviceMemory paramsBufferMemory = nullptr; + + // Persistent memory mapping for UBO to avoid repeated map/unmap operations + void* persistentParamsMemory = nullptr; + uint32_t currentSampleCount = 0; // Track current buffer size to avoid unnecessary recreation + + /** + * @brief Create buffers for HRTF processing. + * @param sampleCount The number of samples to process. + * @return True if creation was successful, false otherwise. + */ + bool createHRTFBuffers(uint32_t sampleCount); + + /** + * @brief Clean up HRTF buffers. + */ + void cleanupHRTFBuffers(); + + /** + * @brief Start the background audio processing thread. + */ + void startAudioThread(); + + /** + * @brief Stop the background audio processing thread. + */ + void stopAudioThread(); + + /** + * @brief Main loop for the background audio processing thread. + */ + void audioThreadLoop(); + + /** + * @brief Process an audio task in the background thread. + * @param task The audio task to process. + */ + void processAudioTask(const std::shared_ptr& task); + + /** + * @brief Submit an audio processing task to the background thread. + * @param inputBuffer The input audio buffer. + * @param sampleCount The number of samples to process. + * @param sourcePosition The position of the sound source. + * @param actualSamplesProcessed The number of samples actually processed. + * @return True if the task was submitted successfully, false otherwise. 
+ */ + bool submitAudioTask(const float* inputBuffer, uint32_t sampleCount, const float* sourcePosition, uint32_t actualSamplesProcessed, uint32_t trimFront); +}; diff --git a/attachments/sync2_engine/engine.cpp b/attachments/sync2_engine/engine.cpp new file mode 100644 index 00000000..b1301c2a --- /dev/null +++ b/attachments/sync2_engine/engine.cpp @@ -0,0 +1,1036 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "engine.h" +#include "mesh_component.h" +#include "scene_loading.h" + +#include +#include +#include +#include +#include +#include + +// This implementation corresponds to the Engine_Architecture chapter in the tutorial: +// @see en/Building_a_Simple_Engine/Engine_Architecture/02_architectural_patterns.adoc + +Engine::Engine() : resourceManager(std::make_unique()) { +} + +bool Engine::IsMainThread() const { + return std::this_thread::get_id() == mainThreadId; +} + +void Engine::ProcessPendingEntityRemovals() { + std::vector names; { + std::lock_guard lk(pendingEntityRemovalsMutex); + if (pendingEntityRemovalNames.empty()) + return; + names.swap(pendingEntityRemovalNames); + } + + // Process on the main thread only (safety) + if (!IsMainThread()) { + // Put them back; we'll retry next main-thread tick + std::lock_guard lk(pendingEntityRemovalsMutex); + pendingEntityRemovalNames.insert(pendingEntityRemovalNames.end(), names.begin(), names.end()); + return; + } + + // Apply removals using the normal API (which takes the appropriate locks). 
+ for (const auto& name : names) { + (void) RemoveEntity(name); + } +} + +Engine::~Engine() { + Cleanup(); +} + +bool Engine::Initialize(const std::string& appName, int width, int height, bool enableValidationLayers, bool debugSync) { + // Create platform +#if defined(PLATFORM_ANDROID) + // For Android, the platform is created with the android_app + // This will be handled in the android_main function + return false; +#else + // Record main thread identity for deferring destructive operations from background threads + mainThreadId = std::this_thread::get_id(); + + platform = CreatePlatform(); + if (!platform->Initialize(appName, width, height)) { + return false; + } + + // Set resize callback + platform->SetResizeCallback([this](int width, int height) { + HandleResize(width, height); + }); + + // Set mouse callback + platform->SetMouseCallback([this](float x, float y, uint32_t buttons) { + handleMouseInput(x, y, buttons); + }); + + // Set keyboard callback + platform->SetKeyboardCallback([this](uint32_t key, bool pressed) { + handleKeyInput(key, pressed); + }); + + // Set char callback + platform->SetCharCallback([this](uint32_t c) { + if (imguiSystem) { + imguiSystem->HandleChar(c); + } + }); + + // Create renderer + renderer = std::make_unique(platform.get()); + if (!renderer->Initialize(appName, enableValidationLayers, debugSync)) { + return false; + } + + try { + // Model loader via constructor; also wire into renderer + modelLoader = std::make_unique(renderer.get()); + renderer->SetModelLoader(modelLoader.get()); + + // Audio system via constructor + audioSystem = std::make_unique(this, renderer.get()); + + // Physics system via constructor (GPU enabled) + physicsSystem = std::make_unique(renderer.get(), true); + + // ImGui via constructor, then connect audio system + imguiSystem = std::make_unique(renderer.get(), width, height); + imguiSystem->SetAudioSystem(audioSystem.get()); + } catch (const std::exception& e) { + std::cerr << "Subsystem initialization 
failed: " << e.what() << std::endl; + return false; + } + + // Generate ball material properties once at load time + GenerateBallMaterial(); + + // Initialize physics scaling system + InitializePhysicsScaling(); + + initialized = true; + return true; +#endif +} + +void Engine::Run() { + if (!initialized) { + throw std::runtime_error("Engine not initialized"); + } + + running = true; + + // Main loop + while (running) { + auto startLoop = std::chrono::steady_clock::now(); + + + // Process platform events + if (!platform->ProcessEvents()) { + running = false; + break; + } + + // Calculate delta time + deltaTimeMs = CalculateDeltaTimeMs(); + + // Update frame counter and FPS + frameCount++; + fpsUpdateTimer += deltaTimeMs.count() * 0.001f; + + // Update window title with FPS and frame time every second + if (fpsUpdateTimer >= 1.0f) { + uint64_t framesSinceLastUpdate = frameCount - lastFPSUpdateFrame; + double avgMs = 0.0; + if (framesSinceLastUpdate > 0 && fpsUpdateTimer > 0.0f) { + currentFPS = static_cast(static_cast(framesSinceLastUpdate) / static_cast(fpsUpdateTimer)); + avgMs = (fpsUpdateTimer / static_cast(framesSinceLastUpdate)) * 1000.0; + } else { + // Avoid divide-by-zero; keep previous FPS and estimate avgMs from last delta + currentFPS = std::max(currentFPS, 1.0f); + avgMs = static_cast(deltaTimeMs.count()); + } + + // Update window title with frame count, FPS, and frame time + std::string title = "Simple Engine - Frame: " + std::to_string(frameCount) + + " | FPS: " + std::to_string(static_cast(currentFPS)) + + " | ms: " + std::to_string(static_cast(avgMs)); + platform->SetWindowTitle(title); + + // Reset timer and frame counter for next update + fpsUpdateTimer = 0.0f; + lastFPSUpdateFrame = frameCount; + } + + // Update + Update(deltaTimeMs); + + // Render + Render(); + } +} + +void Engine::Cleanup() { + if (initialized) { + // Wait for the device to be idle before cleaning up + if (renderer) { + renderer->WaitIdle(); + } + + // Clear entities + { + 
std::unique_lock lk(entitiesMutex); + entities.clear(); + entityMap.clear(); + } + + // Clean up subsystems in reverse order of creation + imguiSystem.reset(); + physicsSystem.reset(); + audioSystem.reset(); + modelLoader.reset(); + renderer.reset(); + platform.reset(); + + initialized = false; + } +} + +Entity* Engine::CreateEntity(const std::string& name) { + std::unique_lock lk(entitiesMutex); + // Always allow duplicate names; map stores a representative entity + // Create the entity + auto entity = std::make_unique(name); + // Add to the vector and map + entities.push_back(std::move(entity)); + Entity* rawPtr = entities.back().get(); + // Update the map to point to the most recently created entity with this name + entityMap[name] = rawPtr; + + return rawPtr; +} + +Entity* Engine::GetEntity(const std::string& name) { + std::shared_lock lk(entitiesMutex); + auto it = entityMap.find(name); + if (it != entityMap.end()) { + return it->second; + } + return nullptr; +} + +bool Engine::RemoveEntity(Entity* entity) { + if (!entity) { + return false; + } + + // If called from a background thread, defer removal to avoid deleting entities + // while the render thread may be iterating a snapshot. 
+ if (!IsMainThread()) { + std::lock_guard lk(pendingEntityRemovalsMutex); + pendingEntityRemovalNames.push_back(entity->GetName()); + return true; + } + + std::unique_lock lk(entitiesMutex); + + // Remember the name before erasing ownership + std::string name = entity->GetName(); + + // Find the entity in the vector + auto it = std::ranges::find_if(entities, + [entity](const std::unique_ptr& e) { + return e.get() == entity; + }); + + if (it != entities.end()) { + // Remove from the vector (ownership) + entities.erase(it); + + // Update the map: point to another entity with the same name if one exists + auto remainingIt = std::ranges::find_if(entities, + [&name](const std::unique_ptr& e) { + return e->GetName() == name; + }); + + if (remainingIt != entities.end()) { + entityMap[name] = remainingIt->get(); + } else { + entityMap.erase(name); + } + + return true; + } + + return false; +} + +bool Engine::RemoveEntity(const std::string& name) { + // If called from a background thread, defer removal to avoid deleting entities + // while the render thread may be iterating a snapshot. 
+ if (!IsMainThread()) { + std::lock_guard lk(pendingEntityRemovalsMutex); + pendingEntityRemovalNames.push_back(name); + return true; + } + + std::unique_lock lk(entitiesMutex); + auto it = entityMap.find(name); + if (it == entityMap.end()) + return false; + Entity* entity = it->second; + if (!entity) + return false; + + // Find the entity in the vector + auto vecIt = std::ranges::find_if(entities, + [entity](const std::unique_ptr& e) { + return e.get() == entity; + }); + if (vecIt == entities.end()) { + entityMap.erase(name); + return false; + } + + entities.erase(vecIt); + + // Update the map: point to another entity with the same name if one exists + auto remainingIt = std::ranges::find_if(entities, + [&name](const std::unique_ptr& e) { + return e && e->GetName() == name; + }); + if (remainingIt != entities.end()) { + entityMap[name] = remainingIt->get(); + } else { + entityMap.erase(name); + } + return true; +} + +void Engine::SetActiveCamera(CameraComponent* cameraComponent) { + activeCamera = cameraComponent; +} + +const CameraComponent* Engine::GetActiveCamera() const { + return activeCamera; +} + +const ResourceManager* Engine::GetResourceManager() const { + return resourceManager.get(); +} + +const Platform* Engine::GetPlatform() const { + return platform.get(); +} + +Renderer* Engine::GetRenderer() { + return renderer.get(); +} + +ModelLoader* Engine::GetModelLoader() { + return modelLoader.get(); +} + +const AudioSystem* Engine::GetAudioSystem() const { + return audioSystem.get(); +} + +PhysicsSystem* Engine::GetPhysicsSystem() { + return physicsSystem.get(); +} + +const ImGuiSystem* Engine::GetImGuiSystem() const { + return imguiSystem.get(); +} + +void Engine::handleMouseInput(float x, float y, uint32_t buttons) { + // Check if ImGui wants to capture mouse input first + bool imguiWantsMouse = imguiSystem && imguiSystem->WantCaptureMouse(); + + // Suppress right-click while loading + if (renderer&& renderer + + -> + IsLoading() + ) { + buttons &= ~2u; 
// clear right button bit + } + + if (!imguiWantsMouse) { + // Handle mouse click for ball throwing (right mouse button) + if (buttons & 2) { + // Right mouse button (bit 1) + if (!cameraControl.mouseRightPressed) { + cameraControl.mouseRightPressed = true; + // Throw a ball on mouse click + ThrowBall(x, y); + } + } else { + cameraControl.mouseRightPressed = false; + } + + // Handle camera rotation when left mouse button is pressed + if (buttons & 1) { + // Left mouse button (bit 0) + if (!cameraControl.mouseLeftPressed) { + cameraControl.mouseLeftPressed = true; + cameraControl.firstMouse = true; + } + + if (cameraControl.firstMouse) { + cameraControl.lastMouseX = x; + cameraControl.lastMouseY = y; + cameraControl.firstMouse = false; + } + + float xOffset = x - cameraControl.lastMouseX; + float yOffset = y - cameraControl.lastMouseY; + cameraControl.lastMouseX = x; + cameraControl.lastMouseY = y; + + xOffset *= cameraControl.mouseSensitivity; + yOffset *= cameraControl.mouseSensitivity; + + // Mouse look: positive X moves view to the right; positive Y moves view up. + // Platform mouse coordinates increase downward, so invert Y. 
+ cameraControl.yaw -= xOffset; + cameraControl.pitch -= yOffset; + + // Constrain pitch to avoid gimbal lock + if (cameraControl.pitch > 89.0f) + cameraControl.pitch = 89.0f; + if (cameraControl.pitch < -89.0f) + cameraControl.pitch = -89.0f; + } else { + cameraControl.mouseLeftPressed = false; + } + } + + if (imguiSystem) { + imguiSystem->HandleMouse(x, y, buttons); + } + + // Always perform hover detection (even when ImGui is active) + HandleMouseHover(x, y); +} +void Engine::handleKeyInput(uint32_t key, bool pressed) { +#if !defined(PLATFORM_ANDROID) + switch (key) { + case GLFW_KEY_W: + case GLFW_KEY_UP: + cameraControl.moveForward = pressed; + break; + case GLFW_KEY_S: + case GLFW_KEY_DOWN: + cameraControl.moveBackward = pressed; + break; + case GLFW_KEY_A: + case GLFW_KEY_LEFT: + cameraControl.moveLeft = pressed; + break; + case GLFW_KEY_D: + case GLFW_KEY_RIGHT: + cameraControl.moveRight = pressed; + break; + case GLFW_KEY_Q: + case GLFW_KEY_PAGE_UP: + cameraControl.moveUp = pressed; + break; + case GLFW_KEY_E: + case GLFW_KEY_PAGE_DOWN: + cameraControl.moveDown = pressed; + break; + default: + break; + } + + if (imguiSystem) { + imguiSystem->HandleKeyboard(key, pressed); + } +#else + // Android uses different input handling via touch events + (void) key; + (void) pressed; +#endif +} + +void Engine::Update(TimeDelta deltaTime) { + // Apply any entity removals requested by background threads. + ProcessPendingEntityRemovals(); + + const bool loading = renderer && renderer->IsLoading(); + + if (!loading) { + ProcessPendingBalls(); + + if (activeCamera) { + glm::vec3 currentCameraPosition = activeCamera->GetPosition(); + physicsSystem->SetCameraPosition(currentCameraPosition); + } + } + + // Derive the next frame's base timeline value from the renderer's CURRENT timeline + // to avoid races where Render advanced farther than our "next" snapshot. + uint64_t currentTimeline = renderer ? 
renderer->GetCurrentTimelineValue() : 0; + // Ensure we round to the next available 10-unit base. + uint64_t nextFrameBaseValue = ((currentTimeline / 10ULL) + 1ULL) * 10ULL; + uint64_t physicsSignalValue = nextFrameBaseValue + Renderer::TimelineMilestones::ePhysicsFinished; + + physicsSystem->Update(deltaTime, physicsSignalValue, static_cast(nextFrameBaseValue / 10ULL)); + + if (!loading) { + audioSystem->Update(deltaTime); + } + + if (imguiSystem) { + imguiSystem->NewFrame(); + } + + if (!loading) { + if (activeCamera) { + UpdateCameraControls(deltaTime); + } + + std::vector snapshot; { + std::shared_lock lk(entitiesMutex); + snapshot.reserve(entities.size()); + for (auto& uptr : entities) { + snapshot.push_back(uptr.get()); + } + } + for (Entity* entity : snapshot) { + if (!entity || !entity->IsActive()) + continue; + entity->Update(deltaTime); + } + } +} + +void Engine::Render() { + // Ensure renderer is ready + if (!renderer || !renderer->IsInitialized()) { + return; + } + + // Check if we have an active camera + if (!activeCamera) { + return; + } + + // Apply any entity removals requested by background threads before taking a snapshot. + ProcessPendingEntityRemovals(); + + // Snapshot entity pointers under a short shared lock, then release the lock + // before rendering. This prevents starving the background loader/physics threads + // that need the unique lock to create entities/components. 
+ std::vector snapshot; { + std::shared_lock lk(entitiesMutex); + snapshot.reserve(entities.size()); + for (auto& uptr : entities) { + snapshot.push_back(uptr.get()); + } + } + + // Render the scene (ImGui will be rendered within the render pass) + renderer->Render(snapshot, activeCamera, imguiSystem.get()); +} + +std::chrono::milliseconds Engine::CalculateDeltaTimeMs() { + // Get current time using a steady clock to avoid system time jumps + uint64_t currentTime = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + + // Initialize lastFrameTimeMs on first call + if (lastFrameTimeMs == 0) { + lastFrameTimeMs = currentTime; + return std::chrono::milliseconds(16); // ~16ms as a sane initial guess + } + + // Calculate delta time in milliseconds + uint64_t delta = currentTime - lastFrameTimeMs; + + // Update last frame time + lastFrameTimeMs = currentTime; + + return std::chrono::milliseconds(static_cast(delta)); +} + +void Engine::HandleResize(int width, int height) const { + if (height <= 0 || width <= 0) { + return; + } + // Update the active camera's aspect ratio + if (activeCamera) { + activeCamera->SetAspectRatio(static_cast(width) / static_cast(height)); + } + + // Notify the renderer that the framebuffer has been resized + if (renderer) { + renderer->SetFramebufferResized(); + } + + // Notify ImGui system about the resize + if (imguiSystem) { + imguiSystem->HandleResize(static_cast(width), static_cast(height)); + } +} + +void Engine::UpdateCameraControls(TimeDelta deltaTime) { + if (!activeCamera) + return; + + // Get a camera transform component + auto* cameraTransform = activeCamera->GetOwner()->GetComponent(); + if (!cameraTransform) + return; + + // Check if camera tracking is enabled + if (imguiSystem&& imguiSystem + + -> + IsCameraTrackingEnabled() + ) { + // Find the first active ball entity + Entity* ballEntity = nullptr; { + std::shared_lock lk(entitiesMutex); + auto ballEntityIt = 
std::ranges::find_if(entities, + [](auto const& entity) { + return entity && entity->IsActive() && (entity->GetName().find("Ball_") != std::string::npos); + }); + ballEntity = (ballEntityIt != entities.end()) ? ballEntityIt->get() : nullptr; + } + + if (ballEntity) { + // Get ball's transform component + auto* ballTransform = ballEntity->GetComponent(); + if (ballTransform) { + glm::vec3 ballPosition = ballTransform->GetPosition(); + + // Position camera at a fixed offset from the ball for good viewing + glm::vec3 cameraOffset = glm::vec3(2.0f, 1.5f, 2.0f); // Behind and above the ball + glm::vec3 cameraPosition = ballPosition + cameraOffset; + + // Update camera position and target + cameraTransform->SetPosition(cameraPosition); + activeCamera->SetTarget(ballPosition); + + return; // Skip manual controls when tracking + } + } + } + + // Manual camera controls (only when tracking is disabled) + // Calculate movement speed + float velocity = cameraControl.cameraSpeed * deltaTime.count() * .001f; + + // Capture base orientation from GLTF camera once and then apply mouse deltas relative to it + if (!cameraControl.baseOrientationCaptured) { + // TransformComponent stores Euler in radians; convert to quaternion + glm::vec3 baseEuler = cameraTransform->GetRotation(); + const glm::quat qx = glm::angleAxis(baseEuler.x, glm::vec3(1.0f, 0.0f, 0.0f)); + const glm::quat qy = glm::angleAxis(baseEuler.y, glm::vec3(0.0f, 1.0f, 0.0f)); + const glm::quat qz = glm::angleAxis(baseEuler.z, glm::vec3(0.0f, 0.0f, 1.0f)); + // Match CameraComponent::UpdateViewMatrix composition (q = qz * qy * qx) + cameraControl.baseOrientation = qz * qy * qx; + cameraControl.baseOrientationCaptured = true; + } + + // Build delta orientation from yaw/pitch mouse deltas (degrees -> radians) + const float yawRad = glm::radians(cameraControl.yaw); + const float pitchRad = glm::radians(cameraControl.pitch); + const glm::quat qDeltaY = glm::angleAxis(yawRad, glm::vec3(0.0f, 1.0f, 0.0f)); + const glm::quat 
qDeltaX = glm::angleAxis(pitchRad, glm::vec3(1.0f, 0.0f, 0.0f)); + // Apply yaw then pitch in the same convention as CameraComponent (ZYX overall), so delta = Ry * Rx + glm::quat qDelta = qDeltaY * qDeltaX; + glm::quat qFinal = cameraControl.baseOrientation * qDelta; + + // Derive camera basis directly from rotated axes to avoid ambiguity + glm::vec3 right = glm::normalize(qFinal * glm::vec3(1.0f, 0.0f, 0.0f)); + glm::vec3 up = glm::normalize(qFinal * glm::vec3(0.0f, 1.0f, 0.0f)); + // Camera forward in world space. + // Our view/projection conventions assume the camera looks down -Z in its local space. + glm::vec3 front = glm::normalize(qFinal * glm::vec3(0.0f, 0.0f, -1.0f)); + + // Get the current camera position + glm::vec3 position = cameraTransform->GetPosition(); + + // Apply movement based on input + if (cameraControl.moveForward) { + position += front * velocity; + } + if (cameraControl.moveBackward) { + position -= front * velocity; + } + if (cameraControl.moveLeft) { + position -= right * velocity; + } + if (cameraControl.moveRight) { + position += right * velocity; + } + if (cameraControl.moveUp) { + position += up * velocity; + } + if (cameraControl.moveDown) { + position -= up * velocity; + } + + // Update camera position + cameraTransform->SetPosition(position); + // Apply rotation to the camera transform based on GLTF base orientation plus mouse deltas + // TransformComponent expects radians Euler (ZYX order in our CameraComponent). 
+ cameraTransform->SetRotation(glm::eulerAngles(qFinal)); + + // Update camera target based on a direction + glm::vec3 target = position + front; + activeCamera->SetTarget(target); + + // Ensure the camera view matrix reflects the new transform immediately this frame + activeCamera->ForceViewMatrixUpdate(); +} + +void Engine::GenerateBallMaterial() { + // Generate 8 random material properties for PBR + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(0.0f, 1.0f); + + // Generate bright, vibrant albedo colors for better visibility + std::uniform_real_distribution brightDis(0.6f, 1.0f); // Ensure bright colors + ballMaterial.albedo = glm::vec3(brightDis(gen), brightDis(gen), brightDis(gen)); + + // Random metallic value (0.0 to 1.0) + ballMaterial.metallic = dis(gen); + + // Random roughness value (0.0 to 1.0) + ballMaterial.roughness = dis(gen); + + // Random ambient occlusion (typically 0.8 to 1.0 for good lighting) + ballMaterial.ao = 0.8f + dis(gen) * 0.2f; + + // Random emissive color (usually subtle) + ballMaterial.emissive = glm::vec3(dis(gen) * 0.3f, dis(gen) * 0.3f, dis(gen) * 0.3f); + + // Decent bounciness (0.6 to 0.9) so bounces are clearly visible + ballMaterial.bounciness = 0.6f + dis(gen) * 0.3f; +} + +void Engine::InitializePhysicsScaling() { + // Based on issue analysis: balls reaching 120+ m/s and extreme positions like (-244, -360, -244) + // The previous 200.0f force scale was causing supersonic speeds and balls flying out of scene + // Need much more conservative scaling for realistic visual gameplay + + // Use smaller game unit scale for more controlled physics + physicsScaling.gameUnitsToMeters = 0.1f; // 1 game unit = 0.1 meter (10cm) - smaller scale + + // Much reduced force scaling to prevent extreme speeds + // With base forces 0.01f-0.05f, this gives final forces of 0.001f-0.005f + physicsScaling.forceScale = 1.0f; // Minimal force scaling for realistic movement + physicsScaling.physicsTimeScale = 1.0f; 
// Keep time scale normal + physicsScaling.gravityScale = 1.0f; // Keep gravity proportional to scale + + // Apply scaled gravity to physics system + glm::vec3 realWorldGravity(0.0f, -9.81f, 0.0f); + glm::vec3 scaledGravity = ScaleGravityForPhysics(realWorldGravity); + physicsSystem->SetGravity(scaledGravity); +} + +float Engine::ScaleForceForPhysics(float gameForce) const { + // Scale force based on the relationship between game units and real world + // and the force scaling factor to make physics feel right + return gameForce * physicsScaling.forceScale * physicsScaling.gameUnitsToMeters; +} + +glm::vec3 Engine::ScaleGravityForPhysics(const glm::vec3& realWorldGravity) const { + // Scale gravity based on game unit scale and gravity scaling factor + // If 1 game unit = 1 meter, then gravity should remain -9.81 + // If 1 game unit = 0.1 meter, then gravity should be -0.981 + return realWorldGravity * physicsScaling.gravityScale * physicsScaling.gameUnitsToMeters; +} + +float Engine::ScaleTimeForPhysics(float deltaTime) const { + // Scale time for physics simulation if needed + // This can be used to slow down or speed up physics relative to rendering + return deltaTime * physicsScaling.physicsTimeScale; +} + +void Engine::ThrowBall(float mouseX, float mouseY) { + if (!activeCamera || !physicsSystem) { + return; + } + + // Get window dimensions + int windowWidth, windowHeight; + platform->GetWindowSize(&windowWidth, &windowHeight); + + // Convert mouse coordinates to normalized device coordinates (-1 to 1) + float ndcX = (2.0f * mouseX) / static_cast(windowWidth) - 1.0f; + float ndcY = 1.0f - (2.0f * mouseY) / static_cast(windowHeight); + + // Get camera matrices + glm::mat4 viewMatrix = activeCamera->GetViewMatrix(); + glm::mat4 projMatrix = activeCamera->GetProjectionMatrix(); + + // Calculate inverse matrices + glm::mat4 invView = glm::inverse(viewMatrix); + glm::mat4 invProj = glm::inverse(projMatrix); + + // Convert NDC to world space for direction + glm::vec4 
rayClip = glm::vec4(ndcX, ndcY, -1.0f, 1.0f); + glm::vec4 rayEye = invProj * rayClip; + rayEye = glm::vec4(rayEye.x, rayEye.y, -1.0f, 0.0f); + glm::vec4 rayWorld = invView * rayEye; + + // Calculate screen center in world coordinates + // Screen center is at NDC (0, 0) which corresponds to the center of the view + glm::vec4 screenCenterClip = glm::vec4(0.0f, 0.0f, -1.0f, 1.0f); + glm::vec4 screenCenterEye = invProj * screenCenterClip; + screenCenterEye = glm::vec4(screenCenterEye.x, screenCenterEye.y, -1.0f, 0.0f); + glm::vec4 screenCenterWorld = invView * screenCenterEye; + glm::vec3 screenCenterDirection = glm::normalize(glm::vec3(screenCenterWorld)); + + // Calculate world position for screen center at a reasonable distance from camera + glm::vec3 cameraPosition = activeCamera->GetPosition(); + glm::vec3 screenCenterWorldPos = cameraPosition + screenCenterDirection * 2.0f; // 2 units in front of camera + + // Calculate throw direction from screen center toward mouse position + glm::vec3 throwDirection = glm::normalize(glm::vec3(rayWorld)); + + // Add upward component for realistic arc trajectory + throwDirection.y += 0.3f; // Add upward bias for throwing arc + throwDirection = glm::normalize(throwDirection); // Re-normalize after modification + + // Generate ball properties now + static int ballCounter = 0; + std::string ballName = "Ball_" + std::to_string(ballCounter++); + + std::random_device rd; + std::mt19937 gen(rd()); + + // Launch balls from screen center toward mouse cursor + glm::vec3 spawnPosition = screenCenterWorldPos; + + // Add small random variation to avoid identical paths + std::uniform_real_distribution posDis(-0.1f, 0.1f); + spawnPosition.x += posDis(gen); + spawnPosition.y += posDis(gen); + spawnPosition.z += posDis(gen); + + std::uniform_real_distribution spinDis(-10.0f, 10.0f); + std::uniform_real_distribution forceDis(15.0f, 35.0f); // Stronger force range for proper throwing feel + + // Store ball creation data for processing outside 
rendering loop + PendingBall pendingBall; + pendingBall.spawnPosition = spawnPosition; + pendingBall.throwDirection = throwDirection; // This is now the corrected direction toward geometry + pendingBall.throwForce = ScaleForceForPhysics(forceDis(gen)); // Apply physics scaling to force + pendingBall.randomSpin = glm::vec3(spinDis(gen), spinDis(gen), spinDis(gen)); + pendingBall.ballName = ballName; + + pendingBalls.push_back(pendingBall); +} + +void Engine::ProcessPendingBalls() { + // Process all pending balls + for (const auto& pendingBall : pendingBalls) { + // Create ball entity + Entity* ballEntity = CreateEntity(pendingBall.ballName); + if (!ballEntity) { + std::cerr << "Failed to create ball entity: " << pendingBall.ballName << std::endl; + continue; + } + + // Add transform component + auto* transform = ballEntity->AddComponent(); + if (!transform) { + std::cerr << "Failed to add TransformComponent to ball: " << pendingBall.ballName << std::endl; + continue; + } + transform->SetPosition(pendingBall.spawnPosition); + transform->SetScale(glm::vec3(1.0f)); // Tennis ball size scale + + // Add mesh component with sphere geometry + auto* mesh = ballEntity->AddComponent(); + if (!mesh) { + std::cerr << "Failed to add MeshComponent to ball: " << pendingBall.ballName << std::endl; + continue; + } + // Create tennis ball-sized, bright red sphere + glm::vec3 brightRed(1.0f, 0.0f, 0.0f); + mesh->CreateSphere(0.0335f, brightRed, 32); // Tennis ball radius, bright color, high detail + mesh->SetTexturePath(renderer->SHARED_BRIGHT_RED_ID); // Use bright red texture for visibility + + // Verify mesh geometry was created + const auto& vertices = mesh->GetVertices(); + const auto& indices = mesh->GetIndices(); + if (vertices.empty() || indices.empty()) { + std::cerr << "ERROR: CreateSphere failed to generate geometry!" 
<< std::endl; + continue; + } + + // Pre-allocate Vulkan resources for this entity (now outside rendering loop) + if (!renderer->preAllocateEntityResources(ballEntity)) { + std::cerr << "Failed to pre-allocate resources for ball: " << pendingBall.ballName << std::endl; + continue; + } + + // Create rigid body with sphere collision shape + RigidBody* rigidBody = physicsSystem->CreateRigidBody(ballEntity, CollisionShape::Sphere, 1.0f); + if (rigidBody) { + // Set bounciness from material + rigidBody->SetRestitution(ballMaterial.bounciness); + + // Request an acceleration structure build so the new ball is included in Ray Query mode. + // We do this after creating the rigid body and initializing the entity. + renderer->RequestAccelerationStructureBuild("Ball spawned"); + + // Apply throw force and spin + glm::vec3 throwImpulse = pendingBall.throwDirection * pendingBall.throwForce; + rigidBody->ApplyImpulse(throwImpulse, glm::vec3(0.0f)); + rigidBody->SetAngularVelocity(pendingBall.randomSpin); + } + } + + // Clear processed balls + pendingBalls.clear(); +} + +void Engine::HandleMouseHover(float mouseX, float mouseY) { + // Update current mouse position for any systems that might need it + currentMouseX = mouseX; + currentMouseY = mouseY; +} + +#if defined(PLATFORM_ANDROID) +// Android-specific implementation +bool Engine::InitializeAndroid(android_app* app, const std::string& appName, bool enableValidationLayers, bool debugSync) { + // Create platform + platform = CreatePlatform(app); + if (!platform->Initialize(appName, 0, 0)) { + return false; + } + + // Set resize callback + platform->SetResizeCallback([this](int width, int height) { + HandleResize(width, height); + }); + + // Set mouse callback + platform->SetMouseCallback([this](float x, float y, uint32_t buttons) { + // Check if ImGui wants to capture mouse input first + bool imguiWantsMouse = imguiSystem && imguiSystem->WantCaptureMouse(); + + if (!imguiWantsMouse) { + // Handle mouse click for ball throwing 
(right mouse button) + if (buttons & 2) { + // Right mouse button (bit 1) + if (!cameraControl.mouseRightPressed) { + cameraControl.mouseRightPressed = true; + // Throw a ball on mouse click + ThrowBall(x, y); + } + } else { + cameraControl.mouseRightPressed = false; + } + } + + if (imguiSystem) { + imguiSystem->HandleMouse(x, y, buttons); + } + }); + + // Set keyboard callback + platform->SetKeyboardCallback([this](uint32_t key, bool pressed) { + if (imguiSystem) { + imguiSystem->HandleKeyboard(key, pressed); + } + }); + + // Set char callback + platform->SetCharCallback([this](uint32_t c) { + if (imguiSystem) { + imguiSystem->HandleChar(c); + } + }); + + // Create renderer + renderer = std::make_unique(platform.get()); + if (!renderer->Initialize(appName, enableValidationLayers, debugSync)) { + return false; + } + + // Get window dimensions from platform for ImGui initialization + int width, height; + platform->GetWindowSize(&width, &height); + + try { + // Model loader via constructor; also wire into renderer + modelLoader = std::make_unique(renderer.get()); + renderer->SetModelLoader(modelLoader.get()); + + // Audio system via constructor + audioSystem = std::make_unique(this, renderer.get()); + + // Physics system via constructor (GPU enabled) + physicsSystem = std::make_unique(renderer.get(), true); + + // ImGui via constructor, then connect audio system + imguiSystem = std::make_unique(renderer.get(), width, height); + imguiSystem->SetAudioSystem(audioSystem.get()); + } catch (const std::exception& e) { + std::cerr << "Subsystem initialization failed: " << e.what() << std::endl; + return false; + } + + // Generate ball material properties once at load time + GenerateBallMaterial(); + + // Initialize physics scaling system + InitializePhysicsScaling(); + + initialized = true; + return true; +} + +void Engine::RunAndroid() { + if (!initialized) { + throw std::runtime_error("Engine not initialized"); + } + + running = true; + + // Main loop is handled by the 
platform + // We just need to update and render when the platform is ready + + // Calculate delta time + deltaTimeMs = CalculateDeltaTimeMs(); + + // Update + Update(deltaTimeMs); + + // Render + Render(); +} +#endif diff --git a/attachments/sync2_engine/engine.h b/attachments/sync2_engine/engine.h new file mode 100644 index 00000000..83097541 --- /dev/null +++ b/attachments/sync2_engine/engine.h @@ -0,0 +1,402 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "audio_system.h" +#include "camera_component.h" +#include "entity.h" +#include "imgui_system.h" +#include "model_loader.h" +#include "physics_system.h" +#include "platform.h" +#include "renderer.h" +#include "resource_manager.h" + +/** + * @brief Main engine class that manages the game loop and subsystems. + * + * This class implements the core engine architecture as described in the Engine_Architecture chapter: + * @see en/Building_a_Simple_Engine/Engine_Architecture/02_architectural_patterns.adoc + */ +class Engine +{ + public: + using TimeDelta = std::chrono::milliseconds; + /** + * @brief Default constructor. + */ + Engine(); + + /** + * @brief Destructor for proper cleanup. + */ + ~Engine(); + + /** + * @brief Initialize the engine. + * @param appName The name of the application. 
+ * @param width The width of the window. + * @param height The height of the window. + * @param enableValidationLayers Whether to enable Vulkan validation layers. + * @param debugSync Whether to enable Vulkan synchronization validation. + * @return True if initialization was successful, false otherwise. + */ + bool Initialize(const std::string &appName, int width, int height, bool enableValidationLayers = true, bool debugSync = false); + + /** + * @brief Run the main game loop. + */ + void Run(); + + /** + * @brief Clean up engine resources. + */ + void Cleanup(); + + /** + * @brief Create a new entity. + * @param name The name of the entity. + * @return A pointer to the newly created entity. + */ + Entity *CreateEntity(const std::string &name); + + /** + * @brief Get an entity by name. + * @param name The name of the entity. + * @return A pointer to the entity, or nullptr if not found. + */ + Entity *GetEntity(const std::string &name); + + /** + * @brief Get all entities. + * @return A const reference to the vector of entities. + */ + const std::vector> &GetEntities() const + { + return entities; + } + + /** + * @brief Remove an entity. + * @param entity The entity to remove. + * @return True if the entity was removed, false otherwise. + */ + bool RemoveEntity(Entity *entity); + + /** + * @brief Remove an entity by name. + * @param name The name of the entity to remove. + * @return True if the entity was removed, false otherwise. + */ + bool RemoveEntity(const std::string &name); + + /** + * @brief Set the active camera. + * @param cameraComponent The camera component to set as active. + */ + void SetActiveCamera(CameraComponent *cameraComponent); + + /** + * @brief Get the active camera. + * @return A pointer to the active camera component, or nullptr if none is set. + */ + const CameraComponent *GetActiveCamera() const; + + /** + * @brief Get the resource manager. + * @return A pointer to the resource manager. 
+ */ + const ResourceManager *GetResourceManager() const; + + /** + * @brief Get the platform. + * @return A pointer to the platform. + */ + const Platform *GetPlatform() const; + + /** + * @brief Get the renderer. + * @return A pointer to the renderer. + */ + Renderer *GetRenderer(); + + /** + * @brief Get the model loader. + * @return A pointer to the model loader. + */ + ModelLoader *GetModelLoader(); + + /** + * @brief Get the audio system. + * @return A pointer to the audio system. + */ + const AudioSystem *GetAudioSystem() const; + + /** + * @brief Get the physics system. + * @return A pointer to the physics system. + */ + PhysicsSystem *GetPhysicsSystem(); + + /** + * @brief Get the ImGui system. + * @return A pointer to the ImGui system. + */ + const ImGuiSystem *GetImGuiSystem() const; + + /** + * @brief Handles mouse input for interaction and camera control. + * + * This method processes mouse input for various functionalities, including interacting with the scene, + * camera rotation, and delegating handling to ImGui or hover systems. + * + * @param x The x-coordinate of the mouse position. + * @param y The y-coordinate of the mouse position. + * @param buttons A bitmask representing the state of mouse buttons. + * Bit 0 corresponds to the left button, and Bit 1 corresponds to the right button. + */ + void handleMouseInput(float x, float y, uint32_t buttons); + + /** + * @brief Handles keyboard input events for controlling the camera and other subsystems. + * + * This method processes key press and release events to update the camera's movement state. + * It also forwards the input to other subsystems like the ImGui interface if applicable. + * + * @param key The key code of the keyboard input. + * @param pressed Indicates whether the key is pressed (true) or released (false). + */ + void handleKeyInput(uint32_t key, bool pressed); + +#if defined(PLATFORM_ANDROID) +/** + * @brief Initialize the engine for Android. + * @param app The Android app. 
+ * @param appName The name of the application. + * @param enableValidationLayers Whether to enable Vulkan validation layers. + * @param debugSync Whether to enable Vulkan synchronization validation. + * @return True if initialization was successful, false otherwise. + */ + bool InitializeAndroid(android_app *app, const std::string &appName, bool enableValidationLayers = true, bool debugSync = false); + + /** + * @brief Run the engine on Android. + */ + void RunAndroid(); +#endif + + private: + // Subsystems + std::unique_ptr platform; + std::unique_ptr renderer; + std::unique_ptr resourceManager; + std::unique_ptr modelLoader; + std::unique_ptr audioSystem; + std::unique_ptr physicsSystem; + std::unique_ptr imguiSystem; + + // Entities + // NOTE: Entities can be created from a background loading thread (see `main.cpp`). + // Protect the containers to avoid iterator invalidation/data races while the render thread + // iterates them. + mutable std::shared_mutex entitiesMutex; + std::vector> entities; + std::unordered_map entityMap; + + // Main thread identity (used to defer destructive operations from background threads) + std::thread::id mainThreadId{}; + + // Background threads may request entity removal while the render thread is iterating snapshots. + // To keep `Entity*` snapshots safe, defer removals to the main thread at a safe point. 
+ std::mutex pendingEntityRemovalsMutex; + std::vector pendingEntityRemovalNames; + void ProcessPendingEntityRemovals(); + bool IsMainThread() const; + + // Active camera + CameraComponent *activeCamera = nullptr; + + // Engine state + bool initialized = false; + bool running = false; + + // Delta time calculation + // deltaTimeMs: time since last frame in milliseconds (for clarity) + std::chrono::milliseconds deltaTimeMs{0}; + uint64_t lastFrameTimeMs = 0; + + // Frame counter and FPS calculation + uint64_t frameCount = 0; + float fpsUpdateTimer = 0.0f; + float currentFPS = 0.0f; + uint64_t lastFPSUpdateFrame = 0; + + // Camera control state + struct CameraControlState + { + bool moveForward = false; + bool moveBackward = false; + bool moveLeft = false; + bool moveRight = false; + bool moveUp = false; + bool moveDown = false; + bool mouseLeftPressed = false; + bool mouseRightPressed = false; + float lastMouseX = 0.0f; + float lastMouseY = 0.0f; + float yaw = 0.0f; + float pitch = 0.0f; + bool firstMouse = true; + float cameraSpeed = 5.0f; + float mouseSensitivity = 0.1f; + bool baseOrientationCaptured = false; + glm::quat baseOrientation{1.0f, 0.0f, 0.0f, 0.0f}; + } cameraControl; + + // Mouse position tracking + float currentMouseX = 0.0f; + float currentMouseY = 0.0f; + + // Ball material properties for PBR + struct BallMaterial + { + glm::vec3 albedo; + float metallic; + float roughness; + float ao; + glm::vec3 emissive; + float bounciness; + }; + + BallMaterial ballMaterial; + + // Physics scaling configuration + // The bistro scene spans roughly 20 game units and represents a realistic cafe/bistro space + // Based on issue feedback: game units should NOT equal 1m and need proper scaling + // Analysis shows bistro geometry pieces are much smaller than assumed + struct PhysicsScaling + { + float gameUnitsToMeters = 0.1f; // 1 game unit = 0.1 meter (10cm) - more realistic scale + float physicsTimeScale = 1.0f; // Normal time scale for stable physics + float 
forceScale = 2.0f; // Much reduced force scaling for visual gameplay (was 10.0f) + float gravityScale = 0.1f; // Scaled gravity for smaller world scale + }; + + PhysicsScaling physicsScaling; + + // Pending ball creation data + struct PendingBall + { + glm::vec3 spawnPosition; + glm::vec3 throwDirection; + float throwForce; + glm::vec3 randomSpin; + std::string ballName; + }; + + std::vector pendingBalls; + + /** + * @brief Update the engine state. + * @param deltaTime The time elapsed since the last update. + */ + // Accepts a time delta in milliseconds for clarity + void Update(TimeDelta deltaTime); + + /** + * @brief Render the scene. + */ + void Render(); + + /** + * @brief Calculate the time delta between frames. + * @return The delta time in milliseconds (steady_clock based). + */ + std::chrono::milliseconds CalculateDeltaTimeMs(); + + /** + * @brief Handle window resize events. + * @param width The new width of the window. + * @param height The new height of the window. + */ + void HandleResize(int width, int height) const; + + /** + * @brief Update camera controls based on input state. + * @param deltaTime The time elapsed since the last update. + */ + void UpdateCameraControls(TimeDelta deltaTime); + + /** + * @brief Generate random PBR material properties for the ball. + */ + void GenerateBallMaterial(); + + /** + * @brief Initialize physics scaling based on scene analysis. + */ + void InitializePhysicsScaling(); + + /** + * @brief Convert a force value from game units to physics units. + * @param gameForce Force in game units. + * @return Force scaled for physics simulation. + */ + float ScaleForceForPhysics(float gameForce) const; + + /** + * @brief Convert gravity from real-world units to game physics units. + * @param realWorldGravity Gravity in m/s². + * @return Gravity scaled for game physics. + */ + glm::vec3 ScaleGravityForPhysics(const glm::vec3 &realWorldGravity) const; + + /** + * @brief Convert time delta for physics simulation. 
+ * @param deltaTime Real delta time. + * @return Scaled delta time for physics. + */ + float ScaleTimeForPhysics(float deltaTime) const; + + /** + * @brief Throw a ball into the scene with random properties. + * @param mouseX The x-coordinate of the mouse click. + * @param mouseY The y-coordinate of the mouse click. + */ + void ThrowBall(float mouseX, float mouseY); + + /** + * @brief Process pending ball creations outside the rendering loop. + */ + void ProcessPendingBalls(); + + /** + * @brief Handle mouse hover to track current mouse position. + * @param mouseX The x-coordinate of the mouse position. + * @param mouseY The y-coordinate of the mouse position. + */ + void HandleMouseHover(float mouseX, float mouseY); +}; diff --git a/attachments/sync2_engine/imgui_system.cpp b/attachments/sync2_engine/imgui_system.cpp new file mode 100644 index 00000000..95fb1f36 --- /dev/null +++ b/attachments/sync2_engine/imgui_system.cpp @@ -0,0 +1,1096 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "imgui_system.h" +#include "audio_system.h" +#include "renderer.h" + +// Include ImGui headers +#include "imgui/imgui.h" + +#include + +// This implementation corresponds to the GUI chapter in the tutorial: +// @see en/Building_a_Simple_Engine/GUI/02_imgui_setup.adoc + +ImGuiSystem::ImGuiSystem() { + // Constructor implementation +} + +ImGuiSystem::~ImGuiSystem() { + // Destructor implementation + Cleanup(); +} + +bool ImGuiSystem::Initialize(Renderer* renderer, uint32_t width, uint32_t height) { + if (initialized) { + return true; + } + + this->renderer = renderer; + this->width = width; + this->height = height; + + // Create ImGui context + context = ImGui::CreateContext(); + if (!context) { + std::cerr << "Failed to create ImGui context" << std::endl; + return false; + } + + // Configure ImGui + ImGuiIO& io = ImGui::GetIO(); + // Set display size + io.DisplaySize = ImVec2(static_cast(width), static_cast(height)); + io.DisplayFramebufferScale = ImVec2(1.0f, 1.0f); + + // Set up ImGui style + ImGui::StyleColorsDark(); + + // Create Vulkan resources + if (!createResources()) { + std::cerr << "Failed to create ImGui Vulkan resources" << std::endl; + Cleanup(); + return false; + } + + // Initialize per-frame buffers containers + if (renderer) { + uint32_t frames = renderer->GetMaxFramesInFlight(); + vertexBuffers.clear(); + vertexBuffers.reserve(frames); + vertexBufferMemories.clear(); + vertexBufferMemories.reserve(frames); + indexBuffers.clear(); + indexBuffers.reserve(frames); + indexBufferMemories.clear(); + indexBufferMemories.reserve(frames); + for (uint32_t i = 0; i < frames; ++i) { + vertexBuffers.emplace_back(nullptr); + vertexBufferMemories.emplace_back(nullptr); + indexBuffers.emplace_back(nullptr); + indexBufferMemories.emplace_back(nullptr); + } + vertexCounts.assign(frames, 0); + indexCounts.assign(frames, 0); + } + + initialized = true; + return true; +} + +void ImGuiSystem::Cleanup() { + if (!initialized) { + return; + } + + // Wait for 
the device to be idle before cleaning up + if (renderer) { + renderer->WaitIdle(); + } + // Destroy ImGui context + if (context) { + ImGui::DestroyContext(context); + context = nullptr; + } + + initialized = false; +} + +void ImGuiSystem::SetAudioSystem(AudioSystem* audioSystem) { + this->audioSystem = audioSystem; + + // Load the grass-step-right.wav file and create audio source + if (audioSystem) { + if (audioSystem->LoadAudio("../Assets/grass-step-right.wav", "grass_step")) { + audioSource = audioSystem->CreateAudioSource("grass_step"); + if (audioSource) { + audioSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ); + audioSource->SetVolume(0.8f); + audioSource->SetLoop(true); + std::cout << "Audio source created and configured for HRTF demo" << std::endl; + } + } + + // Also create a debug ping source for testing + debugPingSource = audioSystem->CreateDebugPingSource("debug_ping"); + if (debugPingSource) { + debugPingSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ); + debugPingSource->SetVolume(0.8f); + debugPingSource->SetLoop(true); + std::cout << "Debug ping source created for audio debugging" << std::endl; + } + } +} + +void ImGuiSystem::NewFrame() { + if (!initialized) { + return; + } + + // Reset the flag at the start of each frame + frameAlreadyRendered = false; + + ImGui::NewFrame(); + + // Loading overlay: show a fullscreen progress bar while the initial scene is loading. + // The bar resets between phases (Textures -> Physics -> AS -> Finalizing) so users + // don't stare at a 100% bar while the engine is still doing work. 
+ if (renderer) { + const bool modelLoading = renderer->IsLoading(); + if (modelLoading) { + ImGuiIO& io = ImGui::GetIO(); + // Suppress right-click while loading + if (io.MouseDown[1]) + io.MouseDown[1] = false; + + const ImVec2 dispSize = io.DisplaySize; + + ImGui::SetNextWindowPos(ImVec2(0, 0)); + ImGui::SetNextWindowSize(dispSize); + ImGuiWindowFlags flags = ImGuiWindowFlags_NoTitleBar | + ImGuiWindowFlags_NoResize | + ImGuiWindowFlags_NoMove | + ImGuiWindowFlags_NoScrollbar | + ImGuiWindowFlags_NoCollapse | + ImGuiWindowFlags_NoSavedSettings | + ImGuiWindowFlags_NoBringToFrontOnFocus | + ImGuiWindowFlags_NoNav; + + if (ImGui::Begin("##LoadingOverlay", nullptr, flags)) { + ImGui::PushStyleVar(ImGuiStyleVar_WindowPadding, ImVec2(0, 0)); + // Center the progress elements + const float barWidth = dispSize.x * 0.8f; + const float barX = (dispSize.x - barWidth) * 0.5f; + const float barY = dispSize.y * 0.45f; + ImGui::SetCursorPos(ImVec2(barX, barY)); + ImGui::BeginGroup(); + + // Global monotonic progress for the main loading bar to avoid resets + float frac = renderer->GetGlobalLoadingProgress(); + auto phase = renderer->GetLoadingPhase(); + + ImGui::ProgressBar(frac, ImVec2(barWidth, 0.0f)); + ImGui::Dummy(ImVec2(0.0f, 10.0f)); + ImGui::SetCursorPosX(barX); + ImGui::Text("Loading: %s (%.1f%%)", renderer->GetLoadingPhaseName(), frac * 100.0f); + + if (phase == Renderer::LoadingPhase::Textures) { + const uint32_t scheduled = renderer->GetTextureTasksScheduled(); + const uint32_t completed = renderer->GetTextureTasksCompleted(); + ImGui::Text("Textures: %u/%u", completed, scheduled); + } else if (phase == Renderer::LoadingPhase::AccelerationStructures) { + const uint32_t done = renderer->GetASBuildItemsDone(); + const uint32_t total = renderer->GetASBuildItemsTotal(); + ImGui::Text("%s (%u/%u, %.1fs)", renderer->GetASBuildStage(), done, total, renderer->GetASBuildElapsedSeconds()); + } + ImGui::EndGroup(); + ImGui::PopStyleVar(); + } + ImGui::End(); + return; + } + 
} + + // --- Streaming status: small progress indicator in the upper-right --- + // Once the scene is visible, textures may continue streaming to the GPU. + // Show a compact progress bar in the top-right while there are still + // outstanding texture tasks, and hide it once everything is fully loaded. + if (renderer) { + const uint32_t uploadTotal = renderer->GetUploadJobsTotal(); + const uint32_t uploadDone = renderer->GetUploadJobsCompleted(); + const bool modelLoading = renderer->IsLoading(); + const bool showASBuild = renderer->ShouldShowASBuildProgressInUI(); + + // Acceleration structure build can happen after initial load completes. + // If it takes a long time, show a compact progress window. + if (!modelLoading && showASBuild) { + ImGuiIO& io = ImGui::GetIO(); + const ImVec2 dispSize = io.DisplaySize; + + const float windowWidth = std::min(320.0f, dispSize.x * 0.42f); + const float windowHeight = 90.0f; + const ImVec2 winPos(dispSize.x - windowWidth - 10.0f, 10.0f); + + ImGui::SetNextWindowPos(winPos, ImGuiCond_Always); + ImGui::SetNextWindowSize(ImVec2(windowWidth, windowHeight)); + ImGuiWindowFlags flags = ImGuiWindowFlags_NoResize | + ImGuiWindowFlags_NoMove | + ImGuiWindowFlags_NoCollapse | + ImGuiWindowFlags_NoSavedSettings; + + if (ImGui::Begin("##ASBuildStatus", nullptr, flags)) { + ImGui::Text("Building acceleration structures..."); + const float asFrac = renderer->GetASBuildProgress(); + ImGui::ProgressBar(asFrac, ImVec2(-1.0f, 0.0f)); + const uint32_t done = renderer->GetASBuildItemsDone(); + const uint32_t total = renderer->GetASBuildItemsTotal(); + ImGui::Text("%s (%u/%u, %.1fs)", + renderer->GetASBuildStage(), + done, + total, + renderer->GetASBuildElapsedSeconds()); + } + ImGui::End(); + } + + if (!modelLoading && uploadTotal > 0 && uploadDone < uploadTotal) { + ImGuiIO& io = ImGui::GetIO(); + const ImVec2 dispSize = io.DisplaySize; + + const float windowWidth = std::min(260.0f, dispSize.x * 0.35f); + const float windowHeight = 120.0f; + // 
If the AS build status window is visible, offset streaming window below it. + const float yBase = 10.0f + (showASBuild ? (90.0f + 10.0f) : 0.0f); + const ImVec2 winPos(dispSize.x - windowWidth - 10.0f, yBase); + + ImGui::SetNextWindowPos(winPos, ImGuiCond_Always); + ImGui::SetNextWindowSize(ImVec2(windowWidth, windowHeight)); + ImGuiWindowFlags flags = ImGuiWindowFlags_NoTitleBar | + ImGuiWindowFlags_NoResize | + ImGuiWindowFlags_NoMove | + ImGuiWindowFlags_NoScrollbar | + ImGuiWindowFlags_NoSavedSettings | + ImGuiWindowFlags_NoCollapse; + + if (ImGui::Begin("##StreamingTextures", nullptr, flags)) { + ImGui::TextUnformatted("Streaming textures to GPU"); + float frac = (uploadTotal > 0) ? (float) uploadDone / (float) uploadTotal : 0.0f; + ImGui::ProgressBar(frac, ImVec2(-1.0f, 0.0f)); + + // Perf counters + const double mbps = renderer->GetUploadThroughputMBps(); + const double avgMs = renderer->GetAverageUploadMs(); + const double totalMB = (double) renderer->GetBytesUploadedTotal() / (1024.0 * 1024.0); + ImGui::Text("Throughput: %.1f MB/s", mbps); + ImGui::SameLine(); + ImGui::Text("Avg upload: %.2f ms/tex", avgMs); + ImGui::Text("Total uploaded: %.1f MB", totalMB); + } + ImGui::End(); + } + } + + // Create HRTF Audio Control UI + ImGui::Begin("HRTF Audio Controls"); + ImGui::Text("3D Audio Position Control"); + + // Audio source selection + ImGui::Separator(); + ImGui::Text("Audio Source Selection:"); + + static bool useDebugPing = false; + if (ImGui::Checkbox("Use Debug Ping (800Hz sine wave)", &useDebugPing)) { + // Stop current audio + if (audioSource&& audioSource + + -> + IsPlaying() + ) { + audioSource->Stop(); + } + if (debugPingSource&& debugPingSource + + -> + IsPlaying() + ) { + debugPingSource->Stop(); + } + std::cout << "Switched to " << (useDebugPing ? 
"debug ping" : "file audio") << " source" << std::endl; + } + + // Display current audio source position + ImGui::Text("Audio Source Position: (%.2f, %.2f, %.2f)", audioSourceX, audioSourceY, audioSourceZ); + ImGui::Text("Current Source: %s", useDebugPing ? "Debug Ping (800Hz)" : "grass-step-right.wav"); + + // Directional control buttons + ImGui::Separator(); + ImGui::Text("Directional Controls:"); + + // Get current active source + AudioSource* currentSource = useDebugPing ? debugPingSource : audioSource; + + // Up button + if (ImGui::Button("Up")) { + audioSourceY += 0.5f; + if (currentSource) { + currentSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ); + } + std::cout << (useDebugPing ? "Debug ping" : "Audio") << " moved up to (" << audioSourceX << ", " << audioSourceY << ", " << audioSourceZ << ")" << std::endl; + } + + // Left and Right buttons on same line + if (ImGui::Button("Left")) { + audioSourceX -= 0.5f; + if (currentSource) { + currentSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ); + } + std::cout << (useDebugPing ? "Debug ping" : "Audio") << " moved left to (" << audioSourceX << ", " << audioSourceY << ", " << audioSourceZ << ")" << std::endl; + } + ImGui::SameLine(); + if (ImGui::Button("Right")) { + audioSourceX += 0.5f; + if (currentSource) { + currentSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ); + } + std::cout << (useDebugPing ? "Debug ping" : "Audio") << " moved right to (" << audioSourceX << ", " << audioSourceY << ", " << audioSourceZ << ")" << std::endl; + } + + // Down button + if (ImGui::Button("Down")) { + audioSourceY -= 0.5f; + if (currentSource) { + currentSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ); + } + std::cout << (useDebugPing ? 
"Debug ping" : "Audio") << " moved down to (" << audioSourceX << ", " << audioSourceY << ", " << audioSourceZ << ")" << std::endl; + } + + // Audio playback controls + ImGui::Separator(); + ImGui::Text("Playback Controls:"); + + // Play button + if (ImGui::Button("Play")) { + if (currentSource) { + currentSource->Play(); + if (audioSystem) { + audioSystem->FlushOutput(); + } + if (useDebugPing) { + std::cout << "Started playing debug ping (800Hz sine wave) with HRTF processing" << std::endl; + } else { + std::cout << "Started playing grass-step-right.wav with HRTF processing" << std::endl; + } + } else { + std::cout << "No audio source available - audio system not initialized" << std::endl; + } + } + ImGui::SameLine(); + + // Stop button + if (ImGui::Button("Stop")) { + if (currentSource) { + currentSource->Stop(); + if (useDebugPing) { + std::cout << "Stopped debug ping playback" << std::endl; + } else { + std::cout << "Stopped audio playback" << std::endl; + } + } + } + + // Additional info + ImGui::Separator(); + if (audioSystem&& audioSystem + + -> + IsHRTFEnabled() + ) { + ImGui::Text("HRTF Processing: ENABLED"); + ImGui::Text("Use directional buttons to move the audio source in 3D space"); + ImGui::Text("You should hear the audio move around you!"); + + // HRTF Processing Mode: GPU only (checkbox removed) + ImGui::Separator(); + ImGui::Text("HRTF Processing Mode:"); + ImGui::Text("Current Mode: Vulkan shader processing (GPU)"); + } + else { + ImGui::Text("HRTF Processing: DISABLED"); + } + + // Ball Debugging Controls + ImGui::Separator(); + ImGui::Text("Ball Debugging Controls:"); + + if (ImGui::Checkbox("Ball-Only Rendering", &ballOnlyRenderingEnabled)) { + std::cout << "Ball-only rendering " << (ballOnlyRenderingEnabled ? 
"enabled" : "disabled") << std::endl; + } + ImGui::SameLine(); + if (ImGui::Button("?##BallOnlyHelp")) { + // Help tooltip will be shown on hover + } + if (ImGui::IsItemHovered()) { + ImGui::SetTooltip("When enabled, only balls will be rendered.\nAll other geometry (bistro scene) will be hidden."); + } + + if (ImGui::Checkbox("Camera Track Ball", &cameraTrackingEnabled)) { + std::cout << "Camera tracking " << (cameraTrackingEnabled ? "enabled" : "disabled") << std::endl; + } + ImGui::SameLine(); + if (ImGui::Button("?##CameraTrackHelp")) { + // Help tooltip will be shown on hover + } + if (ImGui::IsItemHovered()) { + ImGui::SetTooltip("When enabled, camera will automatically\nfollow and look at the ball."); + } + + // Status display + if (ballOnlyRenderingEnabled) { + ImGui::Text("Status: Only balls are being rendered"); + } else { + ImGui::Text("Status: All geometry is being rendered"); + } + + if (cameraTrackingEnabled) { + ImGui::Text("Camera: Tracking ball automatically"); + } else { + ImGui::Text("Camera: Manual control (WASD + mouse)"); + } + + ImGui::End(); +} + +void ImGuiSystem::EndFrameWithoutRendering() { + if (!initialized || frameAlreadyRendered) { + return; + } + ImGui::EndFrame(); + frameAlreadyRendered = true; +} + +void ImGuiSystem::Render(vk::raii::CommandBuffer& commandBuffer, uint32_t frameIndex) { + if (!initialized) { + return; + } + + // End the frame and prepare for rendering + ImGui::Render(); + + // Update vertex and index buffers for this frame + updateBuffers(frameIndex); + + // Record rendering commands + ImDrawData* drawData = ImGui::GetDrawData(); + if (!drawData || drawData->CmdListsCount == 0) { + return; + } + + try { + // Bind the pipeline + commandBuffer.bindPipeline(vk::PipelineBindPoint::eGraphics, *pipeline); + + // Set viewport + vk::Viewport viewport; + viewport.width = ImGui::GetIO().DisplaySize.x; + viewport.height = ImGui::GetIO().DisplaySize.y; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + 
commandBuffer.setViewport(0, {viewport}); + + // Set push constants + struct PushConstBlock { + float scale[2]; + float translate[2]; + } pushConstBlock{}; + + pushConstBlock.scale[0] = 2.0f / ImGui::GetIO().DisplaySize.x; + pushConstBlock.scale[1] = 2.0f / ImGui::GetIO().DisplaySize.y; + pushConstBlock.translate[0] = -1.0f; + pushConstBlock.translate[1] = -1.0f; + + commandBuffer.pushConstants(*pipelineLayout, vk::ShaderStageFlagBits::eVertex, 0, pushConstBlock); + + // Bind vertex and index buffers for this frame + commandBuffer.bindVertexBuffers(0, *vertexBuffers[frameIndex], vk::DeviceSize{0}); + commandBuffer.bindIndexBuffer(*indexBuffers[frameIndex], 0, vk::IndexType::eUint16); + + // Render command lists + int vertexOffset = 0; + int indexOffset = 0; + + for (int i = 0; i < drawData->CmdListsCount; i++) { + const ImDrawList* cmdList = drawData->CmdLists[i]; + + for (int j = 0; j < cmdList->CmdBuffer.Size; j++) { + const ImDrawCmd* pcmd = &cmdList->CmdBuffer[j]; + + // Set scissor rectangle + vk::Rect2D scissor; + scissor.offset.x = std::max(static_cast(pcmd->ClipRect.x), 0); + scissor.offset.y = std::max(static_cast(pcmd->ClipRect.y), 0); + scissor.extent.width = static_cast(pcmd->ClipRect.z - pcmd->ClipRect.x); + scissor.extent.height = static_cast(pcmd->ClipRect.w - pcmd->ClipRect.y); + commandBuffer.setScissor(0, {scissor}); + + // Bind descriptor set (font texture) + commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, *pipelineLayout, 0, {*descriptorSet}, {}); + + // Draw + commandBuffer.drawIndexed(pcmd->ElemCount, 1, indexOffset, vertexOffset, 0); + indexOffset += pcmd->ElemCount; + } + + vertexOffset += cmdList->VtxBuffer.Size; + } + } catch (const std::exception& e) { + std::cerr << "Failed to render ImGui: " << e.what() << std::endl; + } +} + +void ImGuiSystem::HandleMouse(float x, float y, uint32_t buttons) { + if (!initialized) { + return; + } + + ImGuiIO& io = ImGui::GetIO(); + + // Update mouse position + io.MousePos = ImVec2(x, 
y); + + // Update mouse buttons + io.MouseDown[0] = (buttons & 0x01) != 0; // Left button + io.MouseDown[1] = (buttons & 0x02) != 0; // Right button + io.MouseDown[2] = (buttons & 0x04) != 0; // Middle button +} + +void ImGuiSystem::HandleKeyboard(uint32_t key, bool pressed) { + if (!initialized) { + return; + } + + ImGuiIO& io = ImGui::GetIO(); + + // Update key state + if (key < 512) { + io.KeysDown[key] = pressed; + } + + // Update modifier keys + // Using GLFW key codes instead of Windows-specific VK_* constants + io.KeyCtrl = io.KeysDown[341] || io.KeysDown[345]; // Left/Right Control + io.KeyShift = io.KeysDown[340] || io.KeysDown[344]; // Left/Right Shift + io.KeyAlt = io.KeysDown[342] || io.KeysDown[346]; // Left/Right Alt + io.KeySuper = io.KeysDown[343] || io.KeysDown[347]; // Left/Right Super +} + +void ImGuiSystem::HandleChar(uint32_t c) { + if (!initialized) { + return; + } + + ImGuiIO& io = ImGui::GetIO(); + io.AddInputCharacter(c); +} + +void ImGuiSystem::HandleResize(uint32_t width, uint32_t height) { + if (!initialized) { + return; + } + + this->width = width; + this->height = height; + + ImGuiIO& io = ImGui::GetIO(); + io.DisplaySize = ImVec2(static_cast(width), static_cast(height)); +} + +bool ImGuiSystem::WantCaptureKeyboard() const { + if (!initialized) { + return false; + } + + return ImGui::GetIO().WantCaptureKeyboard; +} + +bool ImGuiSystem::WantCaptureMouse() const { + if (!initialized) { + return false; + } + + return ImGui::GetIO().WantCaptureMouse; +} + +bool ImGuiSystem::createResources() { + // Create all Vulkan resources needed for ImGui rendering + if (!createFontTexture()) { + return false; + } + + if (!createDescriptorSetLayout()) { + return false; + } + + if (!createDescriptorPool()) { + return false; + } + + if (!createDescriptorSet()) { + return false; + } + + if (!createPipelineLayout()) { + return false; + } + + if (!createPipeline()) { + return false; + } + + return true; +} + +bool ImGuiSystem::createFontTexture() { + // Get 
font texture from ImGui + ImGuiIO& io = ImGui::GetIO(); + unsigned char* fontData; + int texWidth, texHeight; + io.Fonts->GetTexDataAsRGBA32(&fontData, &texWidth, &texHeight); + vk::DeviceSize uploadSize = texWidth * texHeight * 4 * sizeof(char); + + try { + // Create the font image + vk::ImageCreateInfo imageInfo; + imageInfo.imageType = vk::ImageType::e2D; + imageInfo.format = vk::Format::eR8G8B8A8Unorm; + imageInfo.extent.width = static_cast(texWidth); + imageInfo.extent.height = static_cast(texHeight); + imageInfo.extent.depth = 1; + imageInfo.mipLevels = 1; + imageInfo.arrayLayers = 1; + imageInfo.samples = vk::SampleCountFlagBits::e1; + imageInfo.tiling = vk::ImageTiling::eOptimal; + imageInfo.usage = vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferDst; + imageInfo.sharingMode = vk::SharingMode::eExclusive; + imageInfo.initialLayout = vk::ImageLayout::eUndefined; + + const vk::raii::Device& device = renderer->GetRaiiDevice(); + fontImage = vk::raii::Image(device, imageInfo); + + // Allocate memory for the image + vk::MemoryRequirements memRequirements = fontImage.getMemoryRequirements(); + + vk::MemoryAllocateInfo allocInfo; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = renderer->FindMemoryType(memRequirements.memoryTypeBits, vk::MemoryPropertyFlagBits::eDeviceLocal); + + fontMemory = vk::raii::DeviceMemory(device, allocInfo); + fontImage.bindMemory(*fontMemory, 0); + + // Create a staging buffer for uploading the font data + vk::BufferCreateInfo bufferInfo; + bufferInfo.size = uploadSize; + bufferInfo.usage = vk::BufferUsageFlagBits::eTransferSrc; + bufferInfo.sharingMode = vk::SharingMode::eExclusive; + + vk::raii::Buffer stagingBuffer(device, bufferInfo); + + vk::MemoryRequirements stagingMemRequirements = stagingBuffer.getMemoryRequirements(); + + vk::MemoryAllocateInfo stagingAllocInfo; + stagingAllocInfo.allocationSize = stagingMemRequirements.size; + stagingAllocInfo.memoryTypeIndex = 
renderer->FindMemoryType(stagingMemRequirements.memoryTypeBits, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + vk::raii::DeviceMemory stagingBufferMemory(device, stagingAllocInfo); + stagingBuffer.bindMemory(*stagingBufferMemory, 0); + + // Copy font data to staging buffer + void* data = stagingBufferMemory.mapMemory(0, uploadSize); + memcpy(data, fontData, uploadSize); + stagingBufferMemory.unmapMemory(); + + // Transition image layout and copy data + renderer->TransitionImageLayout(*fontImage, + vk::Format::eR8G8B8A8Unorm, + vk::ImageLayout::eUndefined, + vk::ImageLayout::eTransferDstOptimal); + renderer->CopyBufferToImage(*stagingBuffer, + *fontImage, + static_cast(texWidth), + static_cast(texHeight)); + renderer->TransitionImageLayout(*fontImage, + vk::Format::eR8G8B8A8Unorm, + vk::ImageLayout::eTransferDstOptimal, + vk::ImageLayout::eShaderReadOnlyOptimal); + + // Staging buffer and memory will be automatically cleaned up by RAII + + // Create image view + vk::ImageViewCreateInfo viewInfo; + viewInfo.image = *fontImage; + viewInfo.viewType = vk::ImageViewType::e2D; + viewInfo.format = vk::Format::eR8G8B8A8Unorm; + viewInfo.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + viewInfo.subresourceRange.baseMipLevel = 0; + viewInfo.subresourceRange.levelCount = 1; + viewInfo.subresourceRange.baseArrayLayer = 0; + viewInfo.subresourceRange.layerCount = 1; + + fontView = vk::raii::ImageView(device, viewInfo); + + // Create sampler + vk::SamplerCreateInfo samplerInfo; + samplerInfo.magFilter = vk::Filter::eLinear; + samplerInfo.minFilter = vk::Filter::eLinear; + samplerInfo.mipmapMode = vk::SamplerMipmapMode::eLinear; + samplerInfo.addressModeU = vk::SamplerAddressMode::eClampToEdge; + samplerInfo.addressModeV = vk::SamplerAddressMode::eClampToEdge; + samplerInfo.addressModeW = vk::SamplerAddressMode::eClampToEdge; + samplerInfo.mipLodBias = 0.0f; + samplerInfo.anisotropyEnable = VK_FALSE; + 
samplerInfo.maxAnisotropy = 1.0f; + samplerInfo.compareEnable = VK_FALSE; + samplerInfo.compareOp = vk::CompareOp::eAlways; + samplerInfo.minLod = 0.0f; + samplerInfo.maxLod = 0.0f; + samplerInfo.borderColor = vk::BorderColor::eFloatOpaqueWhite; + samplerInfo.unnormalizedCoordinates = VK_FALSE; + + fontSampler = vk::raii::Sampler(device, samplerInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create font texture: " << e.what() << std::endl; + return false; + } +} + +bool ImGuiSystem::createDescriptorSetLayout() { + try { + vk::DescriptorSetLayoutBinding binding; + binding.descriptorType = vk::DescriptorType::eCombinedImageSampler; + binding.descriptorCount = 1; + binding.stageFlags = vk::ShaderStageFlagBits::eFragment; + binding.binding = 0; + + vk::DescriptorSetLayoutCreateInfo layoutInfo; + layoutInfo.bindingCount = 1; + layoutInfo.pBindings = &binding; + + const vk::raii::Device& device = renderer->GetRaiiDevice(); + descriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +bool ImGuiSystem::createDescriptorPool() { + try { + vk::DescriptorPoolSize poolSize; + poolSize.type = vk::DescriptorType::eCombinedImageSampler; + poolSize.descriptorCount = 1; + + vk::DescriptorPoolCreateInfo poolInfo; + poolInfo.flags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet; + poolInfo.maxSets = 1; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + + const vk::raii::Device& device = renderer->GetRaiiDevice(); + descriptorPool = vk::raii::DescriptorPool(device, poolInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create descriptor pool: " << e.what() << std::endl; + return false; + } +} + +bool ImGuiSystem::createDescriptorSet() { + try { + vk::DescriptorSetAllocateInfo allocInfo; + 
allocInfo.descriptorPool = *descriptorPool; + allocInfo.descriptorSetCount = 1; + allocInfo.pSetLayouts = &(*descriptorSetLayout); + + const vk::raii::Device& device = renderer->GetRaiiDevice(); + vk::raii::DescriptorSets descriptorSets(device, allocInfo); + descriptorSet = std::move(descriptorSets[0]); // Store the first (and only) descriptor set + std::cout << "ImGui created descriptor set with handle: " << *descriptorSet << std::endl; + + // Update descriptor set + vk::DescriptorImageInfo imageInfo; + imageInfo.imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal; + imageInfo.imageView = *fontView; + imageInfo.sampler = *fontSampler; + + vk::WriteDescriptorSet writeSet; + writeSet.dstSet = *descriptorSet; + writeSet.descriptorCount = 1; + writeSet.descriptorType = vk::DescriptorType::eCombinedImageSampler; + writeSet.pImageInfo = &imageInfo; + writeSet.dstBinding = 0; + + device.updateDescriptorSets({writeSet}, {}); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create descriptor set: " << e.what() << std::endl; + return false; + } +} + +bool ImGuiSystem::createPipelineLayout() { + try { + // Push constant range for the transformation matrix + vk::PushConstantRange pushConstantRange; + pushConstantRange.stageFlags = vk::ShaderStageFlagBits::eVertex; + pushConstantRange.offset = 0; + pushConstantRange.size = sizeof(float) * 4; // 2 floats for scale, 2 floats for translate + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo; + pipelineLayoutInfo.setLayoutCount = 1; + pipelineLayoutInfo.pSetLayouts = &(*descriptorSetLayout); + pipelineLayoutInfo.pushConstantRangeCount = 1; + pipelineLayoutInfo.pPushConstantRanges = &pushConstantRange; + + const vk::raii::Device& device = renderer->GetRaiiDevice(); + pipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create pipeline layout: " << e.what() << std::endl; + 
return false; + } +} + +bool ImGuiSystem::createPipeline() { + try { + // Load shaders + vk::raii::ShaderModule shaderModule = renderer->CreateShaderModule("shaders/imgui.spv"); + + // Shader stage creation + vk::PipelineShaderStageCreateInfo vertShaderStageInfo; + vertShaderStageInfo.stage = vk::ShaderStageFlagBits::eVertex; + vertShaderStageInfo.module = *shaderModule; + vertShaderStageInfo.pName = "VSMain"; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo; + fragShaderStageInfo.stage = vk::ShaderStageFlagBits::eFragment; + fragShaderStageInfo.module = *shaderModule; + fragShaderStageInfo.pName = "PSMain"; + + std::array shaderStages = {vertShaderStageInfo, fragShaderStageInfo}; + + // Vertex input + vk::VertexInputBindingDescription bindingDescription; + bindingDescription.binding = 0; + bindingDescription.stride = sizeof(ImDrawVert); + bindingDescription.inputRate = vk::VertexInputRate::eVertex; + + std::array attributeDescriptions; + attributeDescriptions[0].binding = 0; + attributeDescriptions[0].location = 0; + attributeDescriptions[0].format = vk::Format::eR32G32Sfloat; + attributeDescriptions[0].offset = offsetof(ImDrawVert, pos); + + attributeDescriptions[1].binding = 0; + attributeDescriptions[1].location = 1; + attributeDescriptions[1].format = vk::Format::eR32G32Sfloat; + attributeDescriptions[1].offset = offsetof(ImDrawVert, uv); + + attributeDescriptions[2].binding = 0; + attributeDescriptions[2].location = 2; + attributeDescriptions[2].format = vk::Format::eR8G8B8A8Unorm; + attributeDescriptions[2].offset = offsetof(ImDrawVert, col); + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo; + vertexInputInfo.vertexBindingDescriptionCount = 1; + vertexInputInfo.pVertexBindingDescriptions = &bindingDescription; + vertexInputInfo.vertexAttributeDescriptionCount = static_cast(attributeDescriptions.size()); + vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data(); + + // Input assembly + 
vk::PipelineInputAssemblyStateCreateInfo inputAssembly; + inputAssembly.topology = vk::PrimitiveTopology::eTriangleList; + inputAssembly.primitiveRestartEnable = VK_FALSE; + + // Viewport and scissor + vk::PipelineViewportStateCreateInfo viewportState; + viewportState.viewportCount = 1; + viewportState.scissorCount = 1; + viewportState.pViewports = nullptr; // Dynamic state + viewportState.pScissors = nullptr; // Dynamic state + + // Rasterization + vk::PipelineRasterizationStateCreateInfo rasterizer; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = vk::PolygonMode::eFill; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = vk::CullModeFlagBits::eNone; + rasterizer.frontFace = vk::FrontFace::eCounterClockwise; + rasterizer.depthBiasEnable = VK_FALSE; + + // Multisampling + vk::PipelineMultisampleStateCreateInfo multisampling; + multisampling.sampleShadingEnable = VK_FALSE; + multisampling.rasterizationSamples = vk::SampleCountFlagBits::e1; + + // Depth and stencil testing + vk::PipelineDepthStencilStateCreateInfo depthStencil; + depthStencil.depthTestEnable = VK_FALSE; + depthStencil.depthWriteEnable = VK_FALSE; + depthStencil.depthCompareOp = vk::CompareOp::eLessOrEqual; + depthStencil.depthBoundsTestEnable = VK_FALSE; + depthStencil.stencilTestEnable = VK_FALSE; + + // Color blending + vk::PipelineColorBlendAttachmentState colorBlendAttachment; + colorBlendAttachment.colorWriteMask = + vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | + vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA; + colorBlendAttachment.blendEnable = VK_TRUE; + colorBlendAttachment.srcColorBlendFactor = vk::BlendFactor::eSrcAlpha; + colorBlendAttachment.dstColorBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha; + colorBlendAttachment.colorBlendOp = vk::BlendOp::eAdd; + colorBlendAttachment.srcAlphaBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha; + colorBlendAttachment.dstAlphaBlendFactor = 
vk::BlendFactor::eZero; + colorBlendAttachment.alphaBlendOp = vk::BlendOp::eAdd; + + vk::PipelineColorBlendStateCreateInfo colorBlending; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + + // Dynamic state + std::vector dynamicStates = { + vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState; + dynamicState.dynamicStateCount = static_cast(dynamicStates.size()); + dynamicState.pDynamicStates = dynamicStates.data(); + + vk::Format colorFormat = renderer->GetSwapChainImageFormat(); // Get the actual swapchain format + vk::Format depthFormat = renderer->findDepthFormat(); + vk::PipelineRenderingCreateInfo renderingInfo{ + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &colorFormat, + .depthAttachmentFormat = depthFormat + }; + + vk::GraphicsPipelineCreateInfo pipelineInfo{ + .pNext = &renderingInfo, + .stageCount = static_cast(shaderStages.size()), + .pStages = shaderStages.data(), + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizer, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *pipelineLayout, + .basePipelineHandle = nullptr + }; + + const vk::raii::Device& device = renderer->GetRaiiDevice(); + pipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create graphics pipeline: " << e.what() << std::endl; + return false; + } +} + +void ImGuiSystem::updateBuffers(uint32_t frameIndex) { + ImDrawData* drawData = ImGui::GetDrawData(); + if (!drawData || drawData->CmdListsCount == 0) { + return; + } + + try { + const vk::raii::Device& device = renderer->GetRaiiDevice(); + + // Calculate required buffer sizes + 
vk::DeviceSize vertexBufferSize = drawData->TotalVtxCount * sizeof(ImDrawVert); + vk::DeviceSize indexBufferSize = drawData->TotalIdxCount * sizeof(ImDrawIdx); + + // Resize buffers if needed for this frame + if (frameIndex >= vertexCounts.size()) + return; // Safety + + if (static_cast(drawData->TotalVtxCount) > vertexCounts[frameIndex]) { + // Clean up old buffer + vertexBuffers[frameIndex] = vk::raii::Buffer(nullptr); + vertexBufferMemories[frameIndex] = vk::raii::DeviceMemory(nullptr); + + // Create new vertex buffer + vk::BufferCreateInfo bufferInfo; + bufferInfo.size = vertexBufferSize; + bufferInfo.usage = vk::BufferUsageFlagBits::eVertexBuffer; + bufferInfo.sharingMode = vk::SharingMode::eExclusive; + + vertexBuffers[frameIndex] = vk::raii::Buffer(device, bufferInfo); + + vk::MemoryRequirements memRequirements = vertexBuffers[frameIndex].getMemoryRequirements(); + + vk::MemoryAllocateInfo allocInfo; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = renderer->FindMemoryType(memRequirements.memoryTypeBits, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + vertexBufferMemories[frameIndex] = vk::raii::DeviceMemory(device, allocInfo); + vertexBuffers[frameIndex].bindMemory(*vertexBufferMemories[frameIndex], 0); + vertexCounts[frameIndex] = drawData->TotalVtxCount; + } + + if (static_cast(drawData->TotalIdxCount) > indexCounts[frameIndex]) { + // Clean up old buffer + indexBuffers[frameIndex] = vk::raii::Buffer(nullptr); + indexBufferMemories[frameIndex] = vk::raii::DeviceMemory(nullptr); + + // Create new index buffer + vk::BufferCreateInfo bufferInfo; + bufferInfo.size = indexBufferSize; + bufferInfo.usage = vk::BufferUsageFlagBits::eIndexBuffer; + bufferInfo.sharingMode = vk::SharingMode::eExclusive; + + indexBuffers[frameIndex] = vk::raii::Buffer(device, bufferInfo); + + vk::MemoryRequirements memRequirements = indexBuffers[frameIndex].getMemoryRequirements(); + + vk::MemoryAllocateInfo 
allocInfo; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = renderer->FindMemoryType(memRequirements.memoryTypeBits, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + indexBufferMemories[frameIndex] = vk::raii::DeviceMemory(device, allocInfo); + indexBuffers[frameIndex].bindMemory(*indexBufferMemories[frameIndex], 0); + indexCounts[frameIndex] = drawData->TotalIdxCount; + } + + // Upload data to buffers for this frame (only if we have data to upload) + if (drawData->TotalVtxCount > 0 && drawData->TotalIdxCount > 0) { + void* vtxMappedMemory = vertexBufferMemories[frameIndex].mapMemory(0, vertexBufferSize); + void* idxMappedMemory = indexBufferMemories[frameIndex].mapMemory(0, indexBufferSize); + + ImDrawVert* vtxDst = static_cast(vtxMappedMemory); + ImDrawIdx* idxDst = static_cast(idxMappedMemory); + + for (int n = 0; n < drawData->CmdListsCount; n++) { + const ImDrawList* cmdList = drawData->CmdLists[n]; + memcpy(vtxDst, cmdList->VtxBuffer.Data, cmdList->VtxBuffer.Size * sizeof(ImDrawVert)); + memcpy(idxDst, cmdList->IdxBuffer.Data, cmdList->IdxBuffer.Size * sizeof(ImDrawIdx)); + vtxDst += cmdList->VtxBuffer.Size; + idxDst += cmdList->IdxBuffer.Size; + } + + vertexBufferMemories[frameIndex].unmapMemory(); + indexBufferMemories[frameIndex].unmapMemory(); + } + } catch (const std::exception& e) { + std::cerr << "Failed to update buffers: " << e.what() << std::endl; + } +} diff --git a/attachments/sync2_engine/imgui_system.h b/attachments/sync2_engine/imgui_system.h new file mode 100644 index 00000000..aa6b6ece --- /dev/null +++ b/attachments/sync2_engine/imgui_system.h @@ -0,0 +1,265 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +// Forward declarations +class Renderer; +class AudioSystem; +class AudioSource; +struct ImGuiContext; + +/** + * @brief Class for managing ImGui integration with Vulkan. + * + * This class implements the ImGui integration as described in the GUI chapter: + * @see en/Building_a_Simple_Engine/GUI/02_imgui_setup.adoc + */ +class ImGuiSystem { + public: + /** + * @brief Default constructor. + */ + ImGuiSystem(); + + // Constructor-based initialization to replace separate Initialize() calls + ImGuiSystem(Renderer* renderer, uint32_t width, uint32_t height) { + if (!Initialize(renderer, width, height)) { + throw std::runtime_error("ImGuiSystem: initialization failed"); + } + } + + /** + * @brief Destructor for proper cleanup. + */ + ~ImGuiSystem(); + + /** + * @brief Clean up ImGui resources. + */ + void Cleanup(); + + /** + * @brief Start a new ImGui frame. + */ + void NewFrame(); + + /** + * @brief Render the ImGui frame. + * @param commandBuffer The command buffer to record rendering commands to. + */ + void Render(vk::raii::CommandBuffer& commandBuffer, uint32_t frameIndex); + + /** + * @brief Handle mouse input. + * @param x The x-coordinate of the mouse. + * @param y The y-coordinate of the mouse. + * @param buttons The state of the mouse buttons. + */ + void HandleMouse(float x, float y, uint32_t buttons); + + /** + * @brief Handle keyboard input. + * @param key The key code. + * @param pressed Whether the key was pressed or released. 
+ */ + void HandleKeyboard(uint32_t key, bool pressed); + + /** + * @brief Handle character input. + * @param c The character. + */ + void HandleChar(uint32_t c); + + /** + * @brief Handle window resize. + * @param width The new width of the window. + * @param height The new height of the window. + */ + void HandleResize(uint32_t width, uint32_t height); + + /** + * @brief Check if ImGui wants to capture keyboard input. + * @return True if ImGui wants to capture keyboard input, false otherwise. + */ + bool WantCaptureKeyboard() const; + + /** + * @brief Check if ImGui wants to capture mouse input. + * @return True if ImGui wants to capture mouse input, false otherwise. + */ + bool WantCaptureMouse() const; + + /** + * @brief Check if ImGui has already been rendered for the current frame. + * @return True if Render() was already called in NewFrame(), false otherwise. + */ + void EndFrameWithoutRendering(); + bool IsFrameRendered() const { return frameAlreadyRendered; } + + /** + * @brief Set the audio system reference for audio controls. + * @param audioSystem Pointer to the audio system. + */ + void SetAudioSystem(AudioSystem* audioSystem); + + /** + * @brief Get the current PBR rendering state. + * @return True if PBR rendering is enabled, false otherwise. + */ + bool IsPBREnabled() const { + return pbrEnabled; + } + + /** + * @brief Get the current ball-only rendering state. + * @return True if ball-only rendering is enabled, false otherwise. + */ + bool IsBallOnlyRenderingEnabled() const { + return ballOnlyRenderingEnabled; + } + + /** + * @brief Get the current camera tracking state. + * @return True if camera tracking is enabled, false otherwise. + */ + bool IsCameraTrackingEnabled() const { + return cameraTrackingEnabled; + } + void SetPBREnabled(bool pbr) { + pbrEnabled = pbr; + }; + + private: + /** + * @brief Initialize the ImGui system (called by constructor). + * @param renderer Pointer to the renderer. + * @param width The width of the window. 
+ * @param height The height of the window. + * @return True if initialization was successful, false otherwise. + */ + bool Initialize(Renderer* renderer, uint32_t width, uint32_t height); + + // ImGui context + ImGuiContext* context = nullptr; + + // Renderer reference + Renderer* renderer = nullptr; + + // Audio system reference + AudioSystem* audioSystem = nullptr; + AudioSource* audioSource = nullptr; + AudioSource* debugPingSource = nullptr; + + // Audio position tracking + float audioSourceX = 1.0f; + float audioSourceY = 0.0f; + float audioSourceZ = 0.0f; + + // Vulkan resources + vk::raii::DescriptorPool descriptorPool = nullptr; + vk::raii::DescriptorSetLayout descriptorSetLayout = nullptr; + vk::raii::DescriptorSet descriptorSet = nullptr; + vk::raii::PipelineLayout pipelineLayout = nullptr; + vk::raii::Pipeline pipeline = nullptr; + vk::raii::Sampler fontSampler = nullptr; + vk::raii::Image fontImage = nullptr; + vk::raii::DeviceMemory fontMemory = nullptr; + vk::raii::ImageView fontView = nullptr; + // Per-frame dynamic buffers to avoid GPU/CPU contention when frames are in flight + std::vector vertexBuffers; + std::vector vertexBufferMemories; + std::vector indexBuffers; + std::vector indexBufferMemories; + std::vector vertexCounts; + std::vector indexCounts; + + // Window dimensions + uint32_t width = 0; + uint32_t height = 0; + + // Mouse state + float mouseX = 0.0f; + float mouseY = 0.0f; + uint32_t mouseButtons = 0; + + // Initialization flag + bool initialized = false; + + // PBR rendering state + bool pbrEnabled = true; + + // Ball-only rendering and camera tracking state + bool ballOnlyRenderingEnabled = false; + bool cameraTrackingEnabled = false; + + // Track if ImGui::Render() was already called in NewFrame() (during loading overlay) + bool frameAlreadyRendered = false; + + /** + * @brief Create Vulkan resources for ImGui. + * @return True if creation was successful, false otherwise. 
+ */ + bool createResources(); + + /** + * @brief Create font texture. + * @return True if creation was successful, false otherwise. + */ + bool createFontTexture(); + + /** + * @brief Create descriptor set layout. + * @return True if creation was successful, false otherwise. + */ + bool createDescriptorSetLayout(); + + /** + * @brief Create descriptor pool. + * @return True if creation was successful, false otherwise. + */ + bool createDescriptorPool(); + + /** + * @brief Create descriptor set. + * @return True if creation was successful, false otherwise. + */ + bool createDescriptorSet(); + + /** + * @brief Create pipeline layout. + * @return True if creation was successful, false otherwise. + */ + bool createPipelineLayout(); + + /** + * @brief Create pipeline. + * @return True if creation was successful, false otherwise. + */ + bool createPipeline(); + + /** + * @brief Update vertex and index buffers. + */ + void updateBuffers(uint32_t frameIndex); +}; diff --git a/attachments/sync2_engine/main.cpp b/attachments/sync2_engine/main.cpp new file mode 100644 index 00000000..600e62cf --- /dev/null +++ b/attachments/sync2_engine/main.cpp @@ -0,0 +1,149 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "camera_component.h" +#include "crash_reporter.h" +#include "engine.h" +#include "scene_loading.h" +#include "transform_component.h" + +#include +#include +#include +#include +#include +#include + +// Constants +constexpr int WINDOW_WIDTH = 800; +constexpr int WINDOW_HEIGHT = 600; +#if defined(NDEBUG) +constexpr bool ENABLE_VALIDATION_LAYERS = false; +#else +constexpr bool ENABLE_VALIDATION_LAYERS = true; +#endif + +/** + * @brief Set up a simple scene with a camera and some objects. + * @param engine The engine to set up the scene in. + */ +void SetupScene(Engine *engine) +{ + // Create a camera entity + Entity *cameraEntity = engine->CreateEntity("Camera"); + if (!cameraEntity) + { + throw std::runtime_error("Failed to create camera entity"); + } + + // Add a transform component to the camera + auto *cameraTransform = cameraEntity->AddComponent(); + cameraTransform->SetPosition(glm::vec3(0.0f, 0.0f, 3.0f)); + + // Add a camera component to the camera entity + auto *camera = cameraEntity->AddComponent(); + camera->SetAspectRatio(static_cast(WINDOW_WIDTH) / static_cast(WINDOW_HEIGHT)); + + // Set the camera as the active camera + engine->SetActiveCamera(camera); + + // Kick off GLTF model loading on a background thread so the main loop + // can start and render the UI/progress bar while the scene is being + // constructed. Engine::Update will avoid updating entities while + // loading is in progress to prevent data races. + if (auto *renderer = engine->GetRenderer()) + { + renderer->SetLoading(true); + // Defer switching to Textures phase until the first texture job is actually scheduled, + // so the overlay doesn’t sit at 0% with (0/0) for an extended period. + } + std::thread([engine] { + LoadGLTFModel(engine, "../Assets/bistro/bistro.gltf"); + }).detach(); +} + +#if defined(PLATFORM_ANDROID) +/** + * @brief Android entry point. + * @param app The Android app. 
+ */ +void android_main(android_app *app) +{ + try + { + // Create the engine + Engine engine; + + // Initialize the engine + if (!engine.InitializeAndroid(app, "Simple Engine", ENABLE_VALIDATION_LAYERS)) + { + throw std::runtime_error("Failed to initialize engine"); + } + + // Set up the scene + SetupScene(&engine); + + // Run the engine + engine.RunAndroid(); + } + catch (const std::exception &e) + { + LOGE("Exception: %s", e.what()); + } +} +#else +/** + * @brief Desktop entry point. + * @return The exit code. + */ +int main(int argc, char *argv[]) +{ + try + { + std::vector args(argv, argv + argc); + bool debugSync = std::find(args.begin(), args.end(), "--debug-sync") != args.end(); + + // Enable minidump generation for Release-only crashes (e.g., stack cookie failures / fast-fail). + // Writes dumps under the current working directory (the build/run directory). + CrashReporter::GetInstance().Initialize("crashes", "SimpleEngine", "1.0.0"); + + // Create the engine + Engine engine; + + // Initialize the engine + if (!engine.Initialize("Sync2 Engine", WINDOW_WIDTH, WINDOW_HEIGHT, ENABLE_VALIDATION_LAYERS, debugSync)) + { + throw std::runtime_error("Failed to initialize engine"); + } + + // Set up the scene + SetupScene(&engine); + + // Run the engine + engine.Run(); + + CrashReporter::GetInstance().Cleanup(); + + return 0; + } + catch (const std::exception &e) + { + std::cerr << "Exception: " << e.what() << std::endl; + CrashReporter::GetInstance().Cleanup(); + return 1; + } +} +#endif diff --git a/attachments/sync2_engine/memory_pool.cpp b/attachments/sync2_engine/memory_pool.cpp new file mode 100644 index 00000000..e01730b3 --- /dev/null +++ b/attachments/sync2_engine/memory_pool.cpp @@ -0,0 +1,575 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "memory_pool.h" +#include +#include +#include +#include + +MemoryPool::MemoryPool(const vk::raii::Device& device, const vk::raii::PhysicalDevice& physicalDevice) : device(device), physicalDevice(physicalDevice) { +} + +MemoryPool::~MemoryPool() { + // RAII will handle cleanup automatically + std::lock_guard lock(poolMutex); + pools.clear(); +} + +bool MemoryPool::initialize() { + std::lock_guard lock(poolMutex); + + try { + // Configure default pool settings based on typical usage patterns + + // Vertex buffer pool: Large allocations, device-local (increased for large models like bistro) + configurePool( + PoolType::VERTEX_BUFFER, + 128 * 1024 * 1024, + // 128MB blocks + 4096, + // 4KB allocation units + vk::MemoryPropertyFlagBits::eDeviceLocal, + vk::MemoryAllocateFlagBits::eDeviceAddress); + + // Index buffer pool: Medium allocations, device-local (increased for large models like bistro) + configurePool( + PoolType::INDEX_BUFFER, + 64 * 1024 * 1024, + // 64MB blocks + 2048, + // 2KB allocation units + vk::MemoryPropertyFlagBits::eDeviceLocal, + vk::MemoryAllocateFlagBits::eDeviceAddress); + + // Uniform buffer pool: Small allocations, host-visible + // Use 1KB alignment (multiple of 64B nonCoherentAtomSize) to reduce search space + configurePool( + PoolType::UNIFORM_BUFFER, + 4 * 1024 * 1024, + // 4MB blocks + 1024, + // 1KB allocation units + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, + vk::MemoryAllocateFlagBits::eDeviceAddress); + + // Staging buffer pool: Variable allocations, 
host-visible + // Use 1KB alignment (multiple of 64B nonCoherentAtomSize) to reduce search space + configurePool( + PoolType::STAGING_BUFFER, + 16 * 1024 * 1024, + // 16MB blocks + 1024, + // 1KB allocation units + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, + vk::MemoryAllocateFlagBits::eDeviceAddress); + + // Texture image pool: Use moderate block sizes to reduce allocation failures on mid-range GPUs + configurePool( + PoolType::TEXTURE_IMAGE, + 64 * 1024 * 1024, + // 64MB blocks (smaller blocks reduce contiguous allocation pressure) + 4096, + // 4KB allocation units + vk::MemoryPropertyFlagBits::eDeviceLocal); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to initialize memory pool: " << e.what() << std::endl; + return false; + } +} + +void MemoryPool::configurePool( + const PoolType poolType, + const vk::DeviceSize blockSize, + const vk::DeviceSize allocationUnit, + const vk::MemoryPropertyFlags properties, + const vk::MemoryAllocateFlags allocFlags) { + PoolConfig config; + config.blockSize = blockSize; + config.allocationUnit = allocationUnit; + config.properties = properties; + config.allocFlags = allocFlags; + + poolConfigs[poolType] = config; +} + +uint32_t MemoryPool::findMemoryType(const uint32_t typeFilter, const vk::MemoryPropertyFlags properties) const { + const vk::PhysicalDeviceMemoryProperties memProperties = physicalDevice.getMemoryProperties(); + + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if ((typeFilter & (1 << i)) && + (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { + return i; + } + } + + throw std::runtime_error("Failed to find suitable memory type"); +} + +std::unique_ptr MemoryPool::createMemoryBlock(PoolType poolType, vk::DeviceSize size, vk::MemoryAllocateFlags allocFlags) { + auto configIt = poolConfigs.find(poolType); + if (configIt == poolConfigs.end()) { + throw std::runtime_error("Pool type not configured"); + } 
+ + const PoolConfig& config = configIt->second; + + // Use the larger of the requested size or configured block size + const vk::DeviceSize blockSize = std::max(size, config.blockSize); + + // Create a dummy buffer to get memory requirements for the memory type + vk::BufferUsageFlags usage = vk::BufferUsageFlagBits::eVertexBuffer | vk::BufferUsageFlagBits::eIndexBuffer | + vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst; + + if (allocFlags & vk::MemoryAllocateFlagBits::eDeviceAddress) { + usage |= vk::BufferUsageFlagBits::eShaderDeviceAddress; + } + + vk::BufferCreateInfo bufferInfo{ + .size = blockSize, + .usage = usage, + .sharingMode = vk::SharingMode::eExclusive + }; + + vk::raii::Buffer dummyBuffer(device, bufferInfo); + vk::MemoryRequirements memRequirements = dummyBuffer.getMemoryRequirements(); + + uint32_t memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, config.properties); + + // Allocate the memory block using the device-required size + vk::MemoryAllocateInfo allocInfo{ + .allocationSize = memRequirements.size, + .memoryTypeIndex = memoryTypeIndex + }; + + // Add allocation flags (e.g., VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT) if needed + vk::MemoryAllocateFlagsInfo flagsInfo{}; + if (allocFlags != vk::MemoryAllocateFlags{}) { + flagsInfo.flags = allocFlags; + allocInfo.pNext = &flagsInfo; + } + + // Create MemoryBlock with proper initialization to avoid default constructor issues + auto block = std::unique_ptr(new MemoryBlock{ + .memory = vk::raii::DeviceMemory(device, allocInfo), + .size = memRequirements.size, + .used = 0, + .memoryTypeIndex = memoryTypeIndex, + .isMapped = false, + .mappedPtr = nullptr, + .freeList = {}, + .allocationUnit = config.allocationUnit, + .allocFlags = allocFlags + }); + + // Map memory if it's host-visible + block->isMapped = (config.properties & vk::MemoryPropertyFlagBits::eHostVisible) != vk::MemoryPropertyFlags{}; + if 
(block->isMapped) { + block->mappedPtr = block->memory.mapMemory(0, memRequirements.size); + } else { + block->mappedPtr = nullptr; + } + + // Initialize a free list based on the actual allocated size + const size_t numUnits = static_cast(block->size / config.allocationUnit); + block->freeList.resize(numUnits, true); // All units initially free + + return block; +} + +std::unique_ptr MemoryPool::createMemoryBlockWithType(PoolType poolType, vk::DeviceSize size, uint32_t memoryTypeIndex, vk::MemoryAllocateFlags allocFlags) { + auto configIt = poolConfigs.find(poolType); + if (configIt == poolConfigs.end()) { + throw std::runtime_error("Pool type not configured"); + } + const PoolConfig& config = configIt->second; + + // Allocate the memory block with the exact requested size + vk::MemoryAllocateInfo allocInfo{ + .allocationSize = size, + .memoryTypeIndex = memoryTypeIndex + }; + + // Add allocation flags (e.g., VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT) if needed + vk::MemoryAllocateFlagsInfo flagsInfo{}; + if (allocFlags != vk::MemoryAllocateFlags{}) { + flagsInfo.flags = allocFlags; + allocInfo.pNext = &flagsInfo; + } + + // Determine properties from the chosen memory type + const auto memProps = physicalDevice.getMemoryProperties(); + if (memoryTypeIndex >= memProps.memoryTypeCount) { + throw std::runtime_error("Invalid memoryTypeIndex for createMemoryBlockWithType"); + } + const vk::MemoryPropertyFlags typeProps = memProps.memoryTypes[memoryTypeIndex].propertyFlags; + + auto block = std::unique_ptr(new MemoryBlock{ + .memory = vk::raii::DeviceMemory(device, allocInfo), + .size = size, + .used = 0, + .memoryTypeIndex = memoryTypeIndex, + .isMapped = false, + .mappedPtr = nullptr, + .freeList = {}, + .allocationUnit = config.allocationUnit + }); + + block->isMapped = (typeProps & vk::MemoryPropertyFlagBits::eHostVisible) != vk::MemoryPropertyFlags{}; + if (block->isMapped) { + block->mappedPtr = block->memory.mapMemory(0, size); + } + + const size_t numUnits = 
static_cast(block->size / config.allocationUnit); + block->freeList.resize(numUnits, true); + + return block; +} + +std::pair MemoryPool::findSuitableBlock(PoolType poolType, vk::DeviceSize size, vk::DeviceSize alignment) { + auto poolIt = pools.find(poolType); + if (poolIt == pools.end()) { + poolIt = pools.try_emplace(poolType).first; + } + + auto& poolBlocks = poolIt->second; + const PoolConfig& config = poolConfigs[poolType]; + + // Calculate required units (accounting for size alignment) + const vk::DeviceSize alignedSize = ((size + alignment - 1) / alignment) * alignment; + const size_t requiredUnits = static_cast((alignedSize + config.allocationUnit - 1) / config.allocationUnit); + + // Search existing blocks for sufficient free space with proper offset alignment + for (const auto& block : poolBlocks) { + const vk::DeviceSize unit = config.allocationUnit; + const size_t totalUnits = block->freeList.size(); + + size_t i = 0; + while (i < totalUnits) { + // Fast skip over occupied units + if (!block->freeList[i]) { + // Find the next free unit efficiently + auto it = std::find(block->freeList.begin() + i, block->freeList.end(), true); + if (it == block->freeList.end()) { + break; // No more free units in this block + } + i = std::distance(block->freeList.begin(), it); + } + + // Ensure starting unit produces an offset aligned to 'alignment' + vk::DeviceSize startOffset = static_cast(i) * unit; + if ((alignment > 0) && (startOffset % alignment != 0)) { + // Advance i to the next unit that aligns with 'alignment' + const vk::DeviceSize remainder = startOffset % alignment; + const vk::DeviceSize advanceBytes = alignment - remainder; + const size_t advanceUnits = static_cast((advanceBytes + unit - 1) / unit); + i += std::max(advanceUnits, 1); + continue; + } + + // From aligned i, check for consecutive free units + size_t consecutiveFree = 0; + size_t j = i; + while (j < totalUnits && block->freeList[j] && consecutiveFree < requiredUnits) { + ++consecutiveFree; + 
++j; + } + + if (consecutiveFree >= requiredUnits) { + return {block.get(), i}; + } + + // If we found a 'false' at 'j', skip past it! + // If we didn't find enough 'true's but j is still totalUnits, we're done with this block. + i = (j > i) ? j : (i + 1); + } + } + + // No suitable block found; create a new one on demand (no hard limits, allowed during rendering) + try { + auto newBlock = createMemoryBlock(poolType, alignedSize, config.allocFlags); + auto& poolBlocks = pools[poolType]; + poolBlocks.push_back(std::move(newBlock)); + std::cout << "Created new memory block (pool type: " + << static_cast(poolType) << ")" << std::endl; + return {poolBlocks.back().get(), 0}; + } catch (const std::exception& e) { + std::cerr << "Failed to create new memory block: " << e.what() << std::endl; + return {nullptr, 0}; + } +} + +std::unique_ptr MemoryPool::allocate(PoolType poolType, vk::DeviceSize size, vk::DeviceSize alignment) { + std::lock_guard lock(poolMutex); + + auto [block, startUnit] = findSuitableBlock(poolType, size, alignment); + if (!block) { + return nullptr; + } + + const PoolConfig& config = poolConfigs[poolType]; + + // Calculate required units (accounting for alignment) + const vk::DeviceSize alignedSize = ((size + alignment - 1) / alignment) * alignment; + const size_t requiredUnits = (alignedSize + config.allocationUnit - 1) / config.allocationUnit; + + // Mark units as used + for (size_t i = startUnit; i < startUnit + requiredUnits; ++i) { + block->freeList[i] = false; + } + + // Create allocation info + auto allocation = std::make_unique(); + allocation->memory = *block->memory; + allocation->offset = startUnit * config.allocationUnit; + allocation->size = alignedSize; + allocation->memoryTypeIndex = block->memoryTypeIndex; + allocation->isMapped = block->isMapped; + allocation->mappedPtr = block->isMapped ? 
static_cast(block->mappedPtr) + allocation->offset : nullptr; + + block->used += alignedSize; + + return allocation; +} + +void MemoryPool::deallocate(std::unique_ptr allocation) { + if (!allocation) { + return; + } + + std::lock_guard lock(poolMutex); + + // Find the block that contains this allocation + for (auto& [poolType, poolBlocks] : pools) { + const PoolConfig& config = poolConfigs[poolType]; + + for (auto& block : poolBlocks) { + if (*block->memory == allocation->memory) { + // Calculate which units to free + size_t startUnit = allocation->offset / config.allocationUnit; + size_t numUnits = (allocation->size + config.allocationUnit - 1) / config.allocationUnit; + + // Mark units as free + for (size_t i = startUnit; i < startUnit + numUnits; ++i) { + block->freeList[i] = true; + } + + block->used -= allocation->size; + return; + } + } + } + + std::cerr << "Warning: Could not find memory block for deallocation" << std::endl; +} + +std::pair> MemoryPool::createBuffer( + const vk::DeviceSize size, + const vk::BufferUsageFlags usage, + const vk::MemoryPropertyFlags properties) { + // Determine a pool type based on usage and properties + PoolType poolType = PoolType::VERTEX_BUFFER; + + // Check for host-visible requirements first (for instance buffers and staging) + if (properties & vk::MemoryPropertyFlagBits::eHostVisible) { + poolType = PoolType::STAGING_BUFFER; + } else if (usage & vk::BufferUsageFlagBits::eVertexBuffer) { + poolType = PoolType::VERTEX_BUFFER; + } else if (usage & vk::BufferUsageFlagBits::eIndexBuffer) { + poolType = PoolType::INDEX_BUFFER; + } else if (usage & vk::BufferUsageFlagBits::eUniformBuffer) { + poolType = PoolType::UNIFORM_BUFFER; + } + + // Create the buffer + const vk::BufferCreateInfo bufferInfo{ + .size = size, + .usage = usage, + .sharingMode = vk::SharingMode::eExclusive + }; + + vk::raii::Buffer buffer(device, bufferInfo); + + // Get memory requirements + vk::MemoryRequirements memRequirements = 
buffer.getMemoryRequirements(); + + std::unique_ptr allocation; + + // Normal pooled allocation path for all buffers, including those with device address support. + // The pools are pre-configured with VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT where needed. + allocation = allocate(poolType, memRequirements.size, memRequirements.alignment); + if (!allocation) { + throw std::runtime_error("Failed to allocate memory from pool"); + } + + // Bind memory to buffer + buffer.bindMemory(allocation->memory, allocation->offset); + + return {std::move(buffer), std::move(allocation)}; +} + +std::pair> MemoryPool::createImage( + uint32_t width, + uint32_t height, + vk::Format format, + vk::ImageTiling tiling, + vk::ImageUsageFlags usage, + vk::MemoryPropertyFlags properties, + uint32_t mipLevels, + vk::SharingMode sharingMode, + const std::vector& queueFamilyIndices) { + // Create the image + vk::ImageCreateInfo imageInfo{ + .imageType = vk::ImageType::e2D, + .format = format, + .extent = {width, height, 1}, + .mipLevels = std::max(1u, mipLevels), + .arrayLayers = 1, + .samples = vk::SampleCountFlagBits::e1, + .tiling = tiling, + .usage = usage, + .sharingMode = sharingMode, + .initialLayout = vk::ImageLayout::eUndefined + }; + + // If concurrent sharing is requested, provide queue family indices + std::vector fam = queueFamilyIndices; + if (sharingMode == vk::SharingMode::eConcurrent && !fam.empty()) { + imageInfo.queueFamilyIndexCount = static_cast(fam.size()); + imageInfo.pQueueFamilyIndices = fam.data(); + } + + vk::raii::Image image(device, imageInfo); + + // Get memory requirements for this image + vk::MemoryRequirements memRequirements = image.getMemoryRequirements(); + + // Pick a memory type compatible with this image + uint32_t memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties); + + // Create a dedicated memory block for this image with the exact type and size + std::unique_ptr allocation; { + std::lock_guard lock(poolMutex); + auto poolIt = 
pools.find(PoolType::TEXTURE_IMAGE); + if (poolIt == pools.end()) { + poolIt = pools.try_emplace(PoolType::TEXTURE_IMAGE).first; + } + auto& poolBlocks = poolIt->second; + auto block = createMemoryBlockWithType(PoolType::TEXTURE_IMAGE, memRequirements.size, memoryTypeIndex); + + // Prepare allocation that uses the new block from offset 0 + allocation = std::make_unique(); + allocation->memory = *block->memory; + allocation->offset = 0; + allocation->size = memRequirements.size; + allocation->memoryTypeIndex = memoryTypeIndex; + allocation->isMapped = block->isMapped; + allocation->mappedPtr = block->mappedPtr; + + // Mark the entire block as used + block->used = memRequirements.size; + const size_t units = block->freeList.size(); + for (size_t i = 0; i < units; ++i) { + block->freeList[i] = false; + } + + // Keep the block owned by the pool for lifetime management and deallocation support + poolBlocks.push_back(std::move(block)); + } + + // Bind memory to image + image.bindMemory(allocation->memory, allocation->offset); + + return {std::move(image), std::move(allocation)}; +} + +std::pair MemoryPool::getMemoryUsage(PoolType poolType) const { + std::lock_guard lock(poolMutex); + + auto poolIt = pools.find(poolType); + if (poolIt == pools.end()) { + return {0, 0}; + } + + auto [used, total] = std::accumulate( + poolIt->second.begin(), + poolIt->second.end(), + std::pair{0, 0}, + [](const auto& acc, const auto& block) { + return std::pair{acc.first + block->used, acc.second + block->size}; + }); + + return {used, total}; +} + +std::pair MemoryPool::getTotalMemoryUsage() const { + std::lock_guard lock(poolMutex); + + vk::DeviceSize totalUsed = 0; + vk::DeviceSize totalAllocated = 0; + + for (const auto& [poolType, poolBlocks] : pools) { + for (const auto& block : poolBlocks) { + totalUsed += block->used; + totalAllocated += block->size; + } + } + + return {totalUsed, totalAllocated}; +} + +bool MemoryPool::preAllocatePools() { + std::lock_guard lock(poolMutex); + + try 
{ + std::cout << "Pre-allocating initial memory blocks for pools..." << std::endl; + + // Pre-allocate at least one block for each pool type + for (const auto& [poolType, config] : poolConfigs) { + auto poolIt = pools.find(poolType); + if (poolIt == pools.end()) { + poolIt = pools.try_emplace(poolType).first; + } + + auto& poolBlocks = poolIt->second; + if (poolBlocks.empty()) { + // Create initial block for this pool type + auto newBlock = createMemoryBlock(poolType, config.blockSize); + poolBlocks.push_back(std::move(newBlock)); + std::cout << " Pre-allocated block for pool type " << static_cast(poolType) << std::endl; + } + } + + std::cout << "Memory pool pre-allocation completed successfully" << std::endl; + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to pre-allocate memory pools: " << e.what() << std::endl; + return false; + } +} + +void MemoryPool::setRenderingActive(bool active) { + std::lock_guard lock(poolMutex); + renderingActive = active; +} + +bool MemoryPool::isRenderingActive() const { + std::lock_guard lock(poolMutex); + return renderingActive; +} diff --git a/attachments/sync2_engine/memory_pool.h b/attachments/sync2_engine/memory_pool.h new file mode 100644 index 00000000..133aa7b0 --- /dev/null +++ b/attachments/sync2_engine/memory_pool.h @@ -0,0 +1,220 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +/** + * @brief Memory pool allocator for Vulkan resources + * + * This class implements a memory pool system to reduce memory fragmentation + * and improve allocation performance by pre-allocating large chunks of memory + * and sub-allocating from them. + */ +class MemoryPool +{ + public: + /** + * @brief Types of memory pools based on usage patterns + */ + enum class PoolType + { + VERTEX_BUFFER, // Device-local memory for vertex data + INDEX_BUFFER, // Device-local memory for index data + UNIFORM_BUFFER, // Host-visible memory for uniform data + STAGING_BUFFER, // Host-visible memory for staging operations + TEXTURE_IMAGE // Device-local memory for texture images + }; + + /** + * @brief Allocation information for a memory block + */ + struct Allocation + { + vk::DeviceMemory memory; // The underlying device memory + vk::DeviceSize offset; // Offset within the memory block + vk::DeviceSize size; // Size of the allocation + uint32_t memoryTypeIndex; // Memory type index + bool isMapped; // Whether the memory is persistently mapped + void *mappedPtr; // Mapped pointer (if applicable) + }; + + /** + * @brief Memory block within a pool + */ + struct MemoryBlock + { + vk::raii::DeviceMemory memory; // RAII wrapper for device memory + vk::DeviceSize size; // Total size of the block + vk::DeviceSize used; // Currently used bytes + uint32_t memoryTypeIndex; // Memory type index + bool isMapped; // Whether the block is mapped + void *mappedPtr; // Mapped pointer (if applicable) + std::vector freeList; // Free list for sub-allocations + vk::DeviceSize allocationUnit; // Size of each allocation unit + vk::MemoryAllocateFlags allocFlags; // Allocation flags for the block + }; + + private: + const vk::raii::Device &device; + const vk::raii::PhysicalDevice &physicalDevice; + vk::PhysicalDeviceMemoryProperties memPropsCache{}; + + // Pool configurations + struct PoolConfig + { + 
vk::DeviceSize blockSize; // Size of each memory block + vk::DeviceSize allocationUnit; // Minimum allocation unit + vk::MemoryPropertyFlags properties; // Memory properties + vk::MemoryAllocateFlags allocFlags; // Allocation flags + }; + + // Memory pools for different types + std::unordered_map>> pools; + std::unordered_map poolConfigs; + + // Thread safety + mutable std::mutex poolMutex; + + // Optional rendering state flag (no allocation restrictions enforced) + bool renderingActive = false; + + // Helper methods + uint32_t findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const; + std::unique_ptr createMemoryBlock(PoolType poolType, vk::DeviceSize size, vk::MemoryAllocateFlags allocFlags = {}); + // Create a memory block with an explicit memory type index (used for images requiring a specific type) + std::unique_ptr createMemoryBlockWithType(PoolType poolType, vk::DeviceSize size, uint32_t memoryTypeIndex, vk::MemoryAllocateFlags allocFlags = {}); + std::pair findSuitableBlock(PoolType poolType, vk::DeviceSize size, vk::DeviceSize alignment); + + public: + /** + * @brief Constructor + * @param device Vulkan device + * @param physicalDevice Vulkan physical device + */ + MemoryPool(const vk::raii::Device &device, const vk::raii::PhysicalDevice &physicalDevice); + + /** + * @brief Destructor + */ + ~MemoryPool(); + + /** + * @brief Initialize the memory pool with default configurations + * @return True if initialization was successful + */ + bool initialize(); + + /** + * @brief Allocate memory from a specific pool + * @param poolType Type of pool to allocate from + * @param size Size of the allocation + * @param alignment Required alignment + * @return Allocation information, or nullptr if allocation failed + */ + std::unique_ptr allocate(PoolType poolType, vk::DeviceSize size, vk::DeviceSize alignment = 1); + + /** + * @brief Free a previously allocated memory block + * @param allocation The allocation to free + */ + void 
deallocate(std::unique_ptr allocation); + + /** + * @brief Create a buffer using pooled memory + * @param size Size of the buffer + * @param usage Buffer usage flags + * @param properties Memory properties + * @return Pair of buffer and allocation info + */ + std::pair> createBuffer( + vk::DeviceSize size, + vk::BufferUsageFlags usage, + vk::MemoryPropertyFlags properties); + + /** + * @brief Create an image using pooled memory + * + * Each image is backed by its own dedicated memory block, which is tracked in the TEXTURE_IMAGE pool. + * + * @param width Image width + * @param height Image height + * @param format Image format + * @param tiling Image tiling + * @param usage Image usage flags + * @param properties Memory properties + * @param mipLevels Number of mip levels (the implementation clamps values below 1 up to 1) + * @param sharingMode Sharing mode for the image (defaults to exclusive) + * @param queueFamilyIndices Queue family indices used for concurrent sharing (defaults to empty) + * @return Pair of image and allocation info + */ + std::pair> createImage( + uint32_t width, + uint32_t height, + vk::Format format, + vk::ImageTiling tiling, + vk::ImageUsageFlags usage, + vk::MemoryPropertyFlags properties, + uint32_t mipLevels = 1, + vk::SharingMode sharingMode = vk::SharingMode::eExclusive, + const std::vector &queueFamilyIndices = {}); + + /** + * @brief Get memory usage statistics + * @param poolType Type of pool to query + * @return Pair of (used bytes, total bytes) + */ + std::pair getMemoryUsage(PoolType poolType) const; + + /** + * @brief Get total memory usage across all pools + * @return Pair of (used bytes, total bytes) + */ + std::pair getTotalMemoryUsage() const; + + /** + * @brief Configure a specific pool type + * @param poolType Type of pool to configure + * @param blockSize Size of each memory block + * @param allocationUnit Minimum allocation unit + * @param properties Memory properties + * @param allocFlags Memory allocation flags for the pool 
(optional, defaults to none) + */ + void configurePool( + PoolType poolType, + vk::DeviceSize blockSize, + vk::DeviceSize allocationUnit, + vk::MemoryPropertyFlags properties, + vk::MemoryAllocateFlags allocFlags = {}); + + /** + * @brief Pre-allocate initial memory blocks for configured pools + * @return True if pre-allocation was successful + */ + bool preAllocatePools(); + + /** + * @brief Set rendering active state flag (informational only) + * @param 
active Whether rendering is currently active + */ + void setRenderingActive(bool active); + + /** + * @brief Check if rendering is currently active (informational only) + * @return True if rendering is active + */ + bool isRenderingActive() const; +}; diff --git a/attachments/sync2_engine/model_loader.cpp b/attachments/sync2_engine/model_loader.cpp new file mode 100644 index 00000000..3104d13f --- /dev/null +++ b/attachments/sync2_engine/model_loader.cpp @@ -0,0 +1,2022 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "model_loader.h" +#include "mesh_component.h" +#include "renderer.h" +#include +#include +#include +#include +#include +#include +#include + +#include "mikktspace.h" + +// This struct acts as a bridge between the C-style MikkTSpace callbacks +// and our C++ MaterialMesh vertex data. It's passed via the m_pUserData pointer. +struct MikkTSpaceInterface { + std::vector* vertices; + std::vector* indices; +}; + +// These static callback functions are required by the MikkTSpace library. +// They are defined here at file-scope so they are not part of the ModelLoader class. 
+static int getNumFaces(const SMikkTSpaceContext* pContext) { + auto* userData = static_cast(pContext->m_pUserData); + return static_cast(userData->indices->size() / 3); +} + +static int getNumVerticesOfFace(const SMikkTSpaceContext* pContext, const int iFace) { + return 3; +} + +static void getPosition(const SMikkTSpaceContext* pContext, float fvPosOut[], const int iFace, const int iVert) { + auto* userData = static_cast(pContext->m_pUserData); + uint32_t index = (*userData->indices)[iFace * 3 + iVert]; + const glm::vec3& pos = (*userData->vertices)[index].position; + fvPosOut[0] = pos.x; + fvPosOut[1] = pos.y; + fvPosOut[2] = pos.z; +} + +static void getNormal(const SMikkTSpaceContext* pContext, float fvNormOut[], const int iFace, const int iVert) { + auto* userData = static_cast(pContext->m_pUserData); + uint32_t index = (*userData->indices)[iFace * 3 + iVert]; + const glm::vec3& norm = (*userData->vertices)[index].normal; + fvNormOut[0] = norm.x; + fvNormOut[1] = norm.y; + fvNormOut[2] = norm.z; +} + +static void getTexCoord(const SMikkTSpaceContext* pContext, float fvTexcOut[], const int iFace, const int iVert) { + auto* userData = static_cast(pContext->m_pUserData); + uint32_t index = (*userData->indices)[iFace * 3 + iVert]; + const glm::vec2& uv = (*userData->vertices)[index].texCoord; + fvTexcOut[0] = uv.x; + fvTexcOut[1] = uv.y; +} + +static void setTSpaceBasic(const SMikkTSpaceContext* pContext, const float fvTangent[], const float fSign, const int iFace, const int iVert) { + auto* userData = static_cast(pContext->m_pUserData); + uint32_t index = (*userData->indices)[iFace * 3 + iVert]; + Vertex& vert = (*userData->vertices)[index]; + vert.tangent.x = fvTangent[0]; + vert.tangent.y = fvTangent[1]; + vert.tangent.z = fvTangent[2]; + // Clamp handedness to +/-1 to avoid tiny floating deviations + vert.tangent.w = (fSign >= 0.0f) ? 
1.0f : -1.0f; +} + +// KTX2 decoding for GLTF images +#include + +// Helper: load KTX2 file from disk into RGBA8 CPU buffer +static bool LoadKTX2FileToRGBA(const std::string& filePath, std::vector& outData, int& width, int& height, int& channels) { + ktxTexture2* ktxTex = nullptr; + KTX_error_code result = ktxTexture2_CreateFromNamedFile(filePath.c_str(), KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, &ktxTex); + if (result != KTX_SUCCESS || !ktxTex) { + return false; + } + bool needsTranscode = ktxTexture2_NeedsTranscoding(ktxTex); + if (needsTranscode) { + result = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_RGBA32, 0); + if (result != KTX_SUCCESS) { + ktxTexture_Destroy(reinterpret_cast(ktxTex)); + return false; + } + } + width = static_cast(ktxTex->baseWidth); + height = static_cast(ktxTex->baseHeight); + channels = 4; + ktx_size_t offset; + ktxTexture_GetImageOffset(reinterpret_cast(ktxTex), 0, 0, 0, &offset); + const uint8_t* levelData = ktxTexture_GetData(reinterpret_cast(ktxTex)) + offset; + size_t levelSize = needsTranscode ? 
static_cast(width) * static_cast(height) * 4 : ktxTexture_GetImageSize(reinterpret_cast(ktxTex), 0); + outData.resize(levelSize); + std::memcpy(outData.data(), levelData, levelSize); + ktxTexture_Destroy(reinterpret_cast(ktxTex)); + return true; +} + +// Emissive scaling factor to convert from Blender units to engine units +#define EMISSIVE_SCALE_FACTOR (1.0f / 638.0f) +#define LIGHT_SCALE_FACTOR (1.0f / 638.0f) + +ModelLoader::~ModelLoader() { + // Destructor implementation + models.clear(); + materials.clear(); +} + +bool ModelLoader::Initialize(Renderer* _renderer) { + renderer = _renderer; + + if (!renderer) { + std::cerr << "ModelLoader::Initialize: Renderer is null" << std::endl; + return false; + } + + return true; +} + +Model* ModelLoader::LoadGLTF(const std::string& filename) { + // Check if the model is already loaded + auto it = models.find(filename); + if (it != models.end()) { + return it->second.get(); + } + + // Create a new model + auto model = std::make_unique(filename); + + // Parse the GLTF file + if (!ParseGLTF(filename, model.get())) { + std::cerr << "ModelLoader::LoadGLTF: Failed to parse GLTF file: " << filename << std::endl; + return nullptr; + } + + // Store the model + models[filename] = std::move(model); + + return models[filename].get(); +} + +Model* ModelLoader::GetModel(const std::string& name) { + auto it = models.find(name); + if (it != models.end()) { + return it->second.get(); + } + return nullptr; +} + +// Static helper function to lowercase a string (ASCII only) +static std::string ToLower(const std::string& s) { + std::string out = s; + std::ranges::transform(out, + out.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + return out; +} + +// Static helper function for loading KTX2 images in GLTF files +static bool LoadKTX2Image(tinygltf::Image* image, + const int image_idx, + std::string* err, + std::string* warn, + int req_width, + int req_height, + const unsigned char* bytes, + int size, + void* 
user_data) { + // Try KTX2 first using libktx + ktxTexture2* ktxTex = nullptr; + KTX_error_code result = ktxTexture2_CreateFromMemory(bytes, size, KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, &ktxTex); + if (result == KTX_SUCCESS && ktxTex) { + bool needsTranscode = ktxTexture2_NeedsTranscoding(ktxTex); + if (needsTranscode) { + result = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_RGBA32, 0); + if (result != KTX_SUCCESS) { + if (err) + *err = "Failed to transcode KTX2 image: " + std::to_string(result); + ktxTexture_Destroy(reinterpret_cast(ktxTex)); + return false; + } + } + image->width = static_cast(ktxTex->baseWidth); + image->height = static_cast(ktxTex->baseHeight); + image->component = 4; + image->bits = 8; + image->pixel_type = TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE; + + ktx_size_t offset; + ktxTexture_GetImageOffset(reinterpret_cast(ktxTex), 0, 0, 0, &offset); + const uint8_t* levelData = ktxTexture_GetData(reinterpret_cast(ktxTex)) + offset; + size_t levelSize = needsTranscode ? static_cast(image->width) * static_cast(image->height) * 4 : ktxTexture_GetImageSize(reinterpret_cast(ktxTex), 0); + image->image.resize(levelSize); + std::memcpy(image->image.data(), levelData, levelSize); + ktxTexture_Destroy(reinterpret_cast(ktxTex)); + return true; + } + + // Non-KTX images not supported by this loader per project simplification + if (err) { + *err = "Non-KTX2 images are not supported by the custom image loader (use KTX2)."; + } + return false; +} +void ModelLoader::ProcessMaterials(const tinygltf::Model& gltfModel, + const std::string& baseTexturePath, + std::set& loadedTextures) { + // Build/refresh an index -> material mapping that matches glTF material indices. 
+ materialsByIndex.clear(); + materialsByIndex.resize(gltfModel.materials.size(), nullptr); + + // Process materials first + for (size_t i = 0; i < gltfModel.materials.size(); ++i) { + const auto& gltfMaterial = gltfModel.materials[i]; + + // Create PBR material + auto material = std::make_unique(gltfMaterial.name.empty() ? ("material_" + std::to_string(i)) : gltfMaterial.name); + + // Extract PBR properties + if (gltfMaterial.pbrMetallicRoughness.baseColorFactor.size() >= 3) { + material->albedo = glm::vec3( + gltfMaterial.pbrMetallicRoughness.baseColorFactor[0], + gltfMaterial.pbrMetallicRoughness.baseColorFactor[1], + gltfMaterial.pbrMetallicRoughness.baseColorFactor[2]); + if (gltfMaterial.pbrMetallicRoughness.baseColorFactor.size() >= 4) { + material->alpha = static_cast(gltfMaterial.pbrMetallicRoughness.baseColorFactor[3]); + } + } + material->metallic = static_cast(gltfMaterial.pbrMetallicRoughness.metallicFactor); + material->roughness = static_cast(gltfMaterial.pbrMetallicRoughness.roughnessFactor); + + if (gltfMaterial.emissiveFactor.size() >= 3) { + material->emissive = glm::vec3( + gltfMaterial.emissiveFactor[0], + gltfMaterial.emissiveFactor[1], + gltfMaterial.emissiveFactor[2]); + material->emissive *= light_scale; + } + + // Parse KHR_materials_emissive_strength extension + auto extensionIt = gltfMaterial.extensions.find("KHR_materials_emissive_strength"); + if (extensionIt != gltfMaterial.extensions.end()) { + hasEmissiveStrengthExtension = true; + const tinygltf::Value& extension = extensionIt->second; + if (extension.Has("emissiveStrength") && extension.Get("emissiveStrength").IsNumber()) { + material->emissiveStrength = static_cast(extension.Get("emissiveStrength").Get()); + } + } else { + material->emissiveStrength = 0.00058f; + } + + // Alpha mode / cutoff + material->alphaMode = gltfMaterial.alphaMode.empty() ? 
std::string("OPAQUE") : gltfMaterial.alphaMode; + material->alphaCutoff = static_cast(gltfMaterial.alphaCutoff); + + // Transmission (KHR_materials_transmission) + auto transIt = gltfMaterial.extensions.find("KHR_materials_transmission"); + if (transIt != gltfMaterial.extensions.end()) { + const tinygltf::Value& ext = transIt->second; + if (ext.Has("transmissionFactor") && ext.Get("transmissionFactor").IsNumber()) { + material->transmissionFactor = static_cast(ext.Get("transmissionFactor").Get()); + } + } + + // Classify obvious architectural glass and liquid materials for + // specialized rendering. This is a heuristic based primarily on + // material name. + { + std::string lowerName = ToLower(material->GetName()); + bool nameSuggestsGlass = + (lowerName.find("glass") != std::string::npos) || + (lowerName.find("window") != std::string::npos); + + bool probablyLiquid = + (lowerName.find("beer") != std::string::npos) || + (lowerName.find("wine") != std::string::npos) || + (lowerName.find("liquid") != std::string::npos); + + if (nameSuggestsGlass && !probablyLiquid) { + material->isGlass = true; + } + + if (probablyLiquid) { + material->isLiquid = true; + + // Slightly boost liquid visibility. + material->albedo *= 1.4f; + material->albedo = glm::clamp(material->albedo, glm::vec3(0.0f), glm::vec3(4.0f)); + + // Slightly reduce roughness so specular highlights from + // lights help liquids stand out. + material->roughness = glm::clamp(material->roughness * 0.8f, 0.0f, 1.0f); + + // Ensure the liquid is not fully transparent by default. 
+ material->alpha = glm::clamp(material->alpha * 1.2f, 0.15f, 1.0f); + } + } + + // Specular-Glossiness (KHR_materials_pbrSpecularGlossiness) + auto sgIt = gltfMaterial.extensions.find("KHR_materials_pbrSpecularGlossiness"); + if (sgIt != gltfMaterial.extensions.end()) { + const tinygltf::Value& ext = sgIt->second; + material->useSpecularGlossiness = true; + // diffuseFactor -> albedo and alpha + if (ext.Has("diffuseFactor") && ext.Get("diffuseFactor").IsArray()) { + const auto& arr = ext.Get("diffuseFactor").Get(); + if (arr.size() >= 3) { + material->albedo = glm::vec3( + arr[0].IsNumber() ? static_cast(arr[0].Get()) : material->albedo.r, + arr[1].IsNumber() ? static_cast(arr[1].Get()) : material->albedo.g, + arr[2].IsNumber() ? static_cast(arr[2].Get()) : material->albedo.b); + if (arr.size() >= 4 && arr[3].IsNumber()) { + material->alpha = static_cast(arr[3].Get()); + } + } + } + // specularFactor (vec3) + if (ext.Has("specularFactor") && ext.Get("specularFactor").IsArray()) { + const auto& arr = ext.Get("specularFactor").Get(); + if (arr.size() >= 3) { + material->specularFactor = glm::vec3( + arr[0].IsNumber() ? static_cast(arr[0].Get()) : material->specularFactor.r, + arr[1].IsNumber() ? static_cast(arr[1].Get()) : material->specularFactor.g, + arr[2].IsNumber() ? 
static_cast(arr[2].Get()) : material->specularFactor.b); + } + } + // glossinessFactor (float) + if (ext.Has("glossinessFactor") && ext.Get("glossinessFactor").IsNumber()) { + material->glossinessFactor = static_cast(ext.Get("glossinessFactor").Get()); + } + + // Load diffuseTexture into albedoTexturePath if present + if (ext.Has("diffuseTexture") && ext.Get("diffuseTexture").IsObject()) { + const auto& diffObj = ext.Get("diffuseTexture"); + if (diffObj.Has("index") && diffObj.Get("index").IsInt()) { + int texIndex = diffObj.Get("index").Get(); + if (texIndex >= 0 && texIndex < static_cast(gltfModel.textures.size())) { + const auto& texture = gltfModel.textures[texIndex]; + int imageIndex = -1; + if (texture.source >= 0 && texture.source < static_cast(gltfModel.images.size())) { + imageIndex = texture.source; + } else { + auto extBasis = texture.extensions.find("KHR_texture_basisu"); + if (extBasis != texture.extensions.end()) { + const tinygltf::Value& e = extBasis->second; + if (e.Has("source") && e.Get("source").IsInt()) { + int src = e.Get("source").Get(); + if (src >= 0 && src < static_cast(gltfModel.images.size())) + imageIndex = src; + } + } + } + if (imageIndex >= 0) { + const auto& image = gltfModel.images[imageIndex]; + std::string textureId = "gltf_baseColor_" + std::to_string(texIndex); + if (loadedTextures.find(textureId) == loadedTextures.end()) { + if (!image.image.empty()) { + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + material->albedoTexturePath = textureId; + } else if (!image.uri.empty()) { + std::string filePath = baseTexturePath + image.uri; + renderer->LoadTextureAsync(filePath); + material->albedoTexturePath = filePath; + } + loadedTextures.insert(textureId); + } else { + if (!image.image.empty()) { + material->albedoTexturePath = textureId; + } else if (!image.uri.empty()) { + material->albedoTexturePath = baseTexturePath + image.uri; + } + } + } + } + } + } + // Load 
specularGlossinessTexture into specGlossTexturePath and mirror to metallicRoughnessTexturePath (binding 2) + if (ext.Has("specularGlossinessTexture") && ext.Get("specularGlossinessTexture").IsObject()) { + const auto& sgObj = ext.Get("specularGlossinessTexture"); + if (sgObj.Has("index") && sgObj.Get("index").IsInt()) { + int texIndex = sgObj.Get("index").Get(); + if (texIndex >= 0 && texIndex < static_cast(gltfModel.textures.size())) { + const auto& texture = gltfModel.textures[texIndex]; + if (texture.source >= 0 && texture.source < static_cast(gltfModel.images.size())) { + std::string textureId = "gltf_specGloss_" + std::to_string(texIndex); + const auto& image = gltfModel.images[texture.source]; + if (loadedTextures.find(textureId) == loadedTextures.end()) { + if (!image.image.empty()) { + // Embedded image data (already decoded by tinygltf image loader) + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component, false); + material->specGlossTexturePath = textureId; + material->metallicRoughnessTexturePath = textureId; // reuse binding 2 + } else if (!image.uri.empty()) { + // External KTX2 file: offload libktx decode + upload to renderer worker threads + std::string filePath = baseTexturePath + image.uri; + renderer->RegisterTextureAlias(textureId, filePath); + renderer->LoadTextureAsync(filePath); + material->specGlossTexturePath = textureId; + material->metallicRoughnessTexturePath = textureId; // reuse binding 2 + } + loadedTextures.insert(textureId); + } else { + material->specGlossTexturePath = textureId; + material->metallicRoughnessTexturePath = textureId; + } + } + } + } + } + } + + // Extract texture information and load embedded texture data + if (gltfMaterial.pbrMetallicRoughness.baseColorTexture.index >= 0) { + int texIndex = gltfMaterial.pbrMetallicRoughness.baseColorTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + int 
imageIndex = -1; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + imageIndex = texture.source; + } else { + auto extIt = texture.extensions.find("KHR_texture_basisu"); + if (extIt != texture.extensions.end()) { + const tinygltf::Value& ext = extIt->second; + if (ext.Has("source") && ext.Get("source").IsInt()) { + int src = ext.Get("source").Get(); + if (src >= 0 && src < static_cast(gltfModel.images.size())) { + imageIndex = src; + } + } + } + } + if (imageIndex >= 0) { + std::string textureId = "gltf_baseColor_" + std::to_string(texIndex); + material->albedoTexturePath = textureId; + + // Load texture data (embedded or external) + if (loadedTextures.find(textureId) == loadedTextures.end()) { + const auto& image = gltfModel.images[imageIndex]; + if (!image.image.empty()) { + // Always use memory-based upload (KTX2 already decoded by SetImageLoader) + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component, true); + material->albedoTexturePath = textureId; + } else if (!image.uri.empty()) { + // Offload KTX2 file reading/upload to renderer thread pool + std::string filePath = baseTexturePath + image.uri; + renderer->RegisterTextureAlias(textureId, filePath); + renderer->LoadTextureAsync(filePath, true); + material->albedoTexturePath = textureId; + } else { + std::cerr << " Warning: No decoded image bytes for base color texture index " << texIndex << std::endl; + } + loadedTextures.insert(textureId); + } + } + } + } + + if (gltfMaterial.pbrMetallicRoughness.metallicRoughnessTexture.index >= 0) { + int texIndex = gltfMaterial.pbrMetallicRoughness.metallicRoughnessTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + std::string textureId = "gltf_texture_" + std::to_string(texIndex); + material->metallicRoughnessTexturePath = textureId; + + // Load texture 
data (embedded or external) + if (loadedTextures.find(textureId) == loadedTextures.end()) { + const auto& image = gltfModel.images[texture.source]; + if (!image.image.empty()) { + // Load embedded texture data asynchronously + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + } else if (!image.uri.empty()) { + // Offload KTX2 file reading/upload to renderer thread pool + std::string filePath = baseTexturePath + image.uri; + renderer->RegisterTextureAlias(textureId, filePath); + renderer->LoadTextureAsync(filePath); + material->metallicRoughnessTexturePath = textureId; + } else { + std::cerr << " Warning: No decoded bytes for metallic-roughness texture index " << texIndex << std::endl; + } + loadedTextures.insert(textureId); + } + } + } + } + + if (gltfMaterial.normalTexture.index >= 0) { + int texIndex = gltfMaterial.normalTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + int imageIndex = -1; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + imageIndex = texture.source; + } else { + auto extIt = texture.extensions.find("KHR_texture_basisu"); + if (extIt != texture.extensions.end()) { + const tinygltf::Value& ext = extIt->second; + if (ext.Has("source") && ext.Get("source").IsInt()) { + int src = ext.Get("source").Get(); + if (src >= 0 && src < static_cast(gltfModel.images.size())) { + imageIndex = src; + } + } + } + } + if (imageIndex >= 0) { + std::string textureId = "gltf_texture_" + std::to_string(texIndex); + material->normalTexturePath = textureId; + + // Load texture data (embedded or external) + if (loadedTextures.find(textureId) == loadedTextures.end()) { + const auto& image = gltfModel.images[imageIndex]; + if (!image.image.empty()) { + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + material->normalTexturePath = textureId; + } else if 
(!image.uri.empty()) { + // Offload KTX2 file reading/upload to renderer thread pool + std::string filePath = baseTexturePath + image.uri; + renderer->RegisterTextureAlias(textureId, filePath); + renderer->LoadTextureAsync(filePath); + material->normalTexturePath = textureId; + } else { + std::cerr << " Warning: No decoded bytes for normal texture index " << texIndex << std::endl; + } + loadedTextures.insert(textureId); + } + } + } + } + + if (gltfMaterial.occlusionTexture.index >= 0) { + int texIndex = gltfMaterial.occlusionTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + std::string textureId = "gltf_texture_" + std::to_string(texIndex); + material->occlusionTexturePath = textureId; + + // Load texture data (embedded or external) + if (loadedTextures.find(textureId) == loadedTextures.end()) { + const auto& image = gltfModel.images[texture.source]; + if (!image.image.empty()) { + // Schedule embedded texture upload + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + } else if (!image.uri.empty()) { + // Offload KTX2 file reading/upload to renderer thread pool + std::string filePath = baseTexturePath + image.uri; + renderer->RegisterTextureAlias(textureId, filePath); + renderer->LoadTextureAsync(filePath); + material->occlusionTexturePath = textureId; + } else { + std::cerr << " Warning: No decoded bytes for occlusion texture index " << texIndex << std::endl; + } + loadedTextures.insert(textureId); + } + } + } + } + + if (gltfMaterial.emissiveTexture.index >= 0) { + int texIndex = gltfMaterial.emissiveTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + std::string textureId = "gltf_texture_" + std::to_string(texIndex); + 
material->emissiveTexturePath = textureId; + + // Load texture data (embedded or external) + if (loadedTextures.find(textureId) == loadedTextures.end()) { + const auto& image = gltfModel.images[texture.source]; + if (!image.image.empty()) { + // Schedule embedded texture upload + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + } else if (!image.uri.empty()) { + // Offload KTX2 file reading/upload to renderer thread pool + std::string filePath = baseTexturePath + image.uri; + renderer->RegisterTextureAlias(textureId, filePath); + renderer->LoadTextureAsync(filePath); + material->emissiveTexturePath = textureId; + } else { + std::cerr << " Warning: No decoded bytes for emissive texture index " << texIndex << std::endl; + } + loadedTextures.insert(textureId); + } + } + } + } + + // Store the material + Material* rawPtr = material.get(); + materials[material->GetName()] = std::move(material); + if (i < materialsByIndex.size()) { + materialsByIndex[i] = rawPtr; + } + } + + // Handle KHR_materials_pbrSpecularGlossiness.diffuseTexture for baseColor when still missing + for (size_t i = 0; i < gltfModel.materials.size(); ++i) { + const auto& gltfMaterial = gltfModel.materials[i]; + std::string matName = gltfMaterial.name.empty() ? 
("material_" + std::to_string(i)) : gltfMaterial.name; + auto matIt = materials.find(matName); + if (matIt == materials.end()) + continue; + Material* mat = matIt->second.get(); + if (!mat || !mat->albedoTexturePath.empty()) + continue; + auto extIt = gltfMaterial.extensions.find("KHR_materials_pbrSpecularGlossiness"); + if (extIt != gltfMaterial.extensions.end()) { + const tinygltf::Value& ext = extIt->second; + if (ext.Has("diffuseTexture") && ext.Get("diffuseTexture").IsObject()) { + const auto& diffObj = ext.Get("diffuseTexture"); + if (diffObj.Has("index") && diffObj.Get("index").IsInt()) { + int texIndex = diffObj.Get("index").Get(); + if (texIndex >= 0 && texIndex < static_cast(gltfModel.textures.size())) { + const auto& texture = gltfModel.textures[texIndex]; + int imageIndex = -1; + if (texture.source >= 0 && texture.source < static_cast(gltfModel.images.size())) { + imageIndex = texture.source; + } else { + auto extBasis = texture.extensions.find("KHR_texture_basisu"); + if (extBasis != texture.extensions.end()) { + const tinygltf::Value& e = extBasis->second; + if (e.Has("source") && e.Get("source").IsInt()) { + int src = e.Get("source").Get(); + if (src >= 0 && src < static_cast(gltfModel.images.size())) + imageIndex = src; + } + } + } + if (imageIndex >= 0) { + const auto& image = gltfModel.images[imageIndex]; + std::string texIdOrPath; + if (!image.uri.empty()) { + texIdOrPath = baseTexturePath + image.uri; + // Schedule async load; libktx decoding will occur on renderer worker threads + renderer->LoadTextureAsync(texIdOrPath, true); + mat->albedoTexturePath = texIdOrPath; + } + if (mat->albedoTexturePath.empty() && !image.image.empty()) { + // Upload embedded image data (already decoded via our image loader when KTX2) + texIdOrPath = "gltf_baseColor_" + std::to_string(texIndex); + renderer->LoadTextureFromMemoryAsync(texIdOrPath, image.image.data(), image.width, image.height, image.component, true); + mat->albedoTexturePath = texIdOrPath; + } + } + } 
+ } + } + } + } + + // Heuristic pass: fill missing baseColor (albedo) by deriving from normal map filenames + // Many Bistro materials have no baseColorTexture index. When that happens, try inferring + // the base color from the normal map by replacing common suffixes like _ddna -> _d/_c/_diffuse/_basecolor/_albedo. + for (auto& kv : materials) { + auto& material = kv.second; + Material* mat = material.get(); + if (!mat) + continue; + if (!mat->albedoTexturePath.empty()) + continue; // already set + // Only attempt if we have an external normal texture path to derive from + if (mat->normalTexturePath.empty()) + continue; + const std::string& normalPath = mat->normalTexturePath; + // Skip embedded IDs like gltf_* which were already handled by memory uploads + if (normalPath.rfind("gltf_", 0) == 0) + continue; + + std::string candidateBase = normalPath; + std::string normalLower = candidateBase; + for (auto& ch : normalLower) + ch = static_cast(std::tolower(static_cast(ch))); + size_t pos = normalLower.find("_ddna"); + if (pos == std::string::npos) { + // Try a few additional normal suffixes seen in the wild + pos = normalLower.find("_n"); + } + if (pos != std::string::npos) { + static const char* suffixes[] = {"_d", "_c", "_cm", "_diffuse", "_basecolor", "_albedo"}; + for (const char* suf : suffixes) { + std::string cand = candidateBase; + cand.replace(pos, normalLower[pos] == '_' && normalLower.compare(pos, 5, "_ddna") == 0 ? 
5 : 2, suf); + // Ensure the file exists before attempting to load + if (std::filesystem::exists(cand)) { + // Schedule async load; libktx decoding will occur on renderer worker threads + renderer->LoadTextureAsync(cand, true); + mat->albedoTexturePath = cand; + break; + } + } + } + } + + // Secondary heuristic: scan glTF images for base color by material-name match when still missing + for (auto& [materialName, materialPtr] : materials) { + Material* mat = materialPtr.get(); + if (!mat) + continue; + if (!mat->albedoTexturePath.empty()) + continue; // already resolved + // Try to find an image URI that looks like the base color for this material + std::string materialNameLower = materialName; + std::ranges::transform(materialNameLower, materialNameLower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + for (const auto& image : gltfModel.images) { + if (image.uri.empty()) + continue; + std::string imageUri = image.uri; + std::string imageUriLower = imageUri; + std::ranges::transform(imageUriLower, imageUriLower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + bool looksBase = imageUriLower.find("basecolor") != std::string::npos || + imageUriLower.find("albedo") != std::string::npos || + imageUriLower.find("diffuse") != std::string::npos; + if (!looksBase) + continue; + bool nameMatches = imageUriLower.find(materialNameLower) != std::string::npos; + if (!nameMatches) { + // Best-effort: try prefix of image name before '_' against material name + size_t underscore = imageUriLower.find('_'); + if (underscore != std::string::npos) { + std::string prefix = imageUriLower.substr(0, underscore); + nameMatches = materialNameLower.find(prefix) != std::string::npos; + } + } + if (!nameMatches) + continue; + + std::string textureId = baseTexturePath + imageUri; // use path string as ID for cache + if (!image.image.empty()) { + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, 
image.component); + mat->albedoTexturePath = textureId; + break; + } else { + // Fallback: offload KTX2 file load to renderer threads + renderer->LoadTextureAsync(textureId); + mat->albedoTexturePath = textureId; + break; + } + } + } +} +void ModelLoader::ProcessCameras(const tinygltf::Model& gltfModel, Model* model) { + if (!gltfModel.cameras.empty()) { + std::cout << "Found " << gltfModel.cameras.size() << " camera(s) in GLTF file" << std::endl; + + for (size_t i = 0; i < gltfModel.cameras.size(); ++i) { + const auto& gltfCamera = gltfModel.cameras[i]; + std::cout << " Camera " << i << ": " << gltfCamera.name << std::endl; + + // Store camera data in the model for later use + CameraData cameraData; + cameraData.name = gltfCamera.name.empty() ? ("camera_" + std::to_string(i)) : gltfCamera.name; + + if (gltfCamera.type == "perspective") { + cameraData.isPerspective = true; + cameraData.fov = static_cast(gltfCamera.perspective.yfov); + cameraData.aspectRatio = static_cast(gltfCamera.perspective.aspectRatio); + cameraData.nearPlane = static_cast(gltfCamera.perspective.znear); + cameraData.farPlane = static_cast(gltfCamera.perspective.zfar); + std::cout << " Perspective camera: FOV=" << cameraData.fov + << ", Aspect=" << cameraData.aspectRatio + << ", Near=" << cameraData.nearPlane + << ", Far=" << cameraData.farPlane << std::endl; + } else if (gltfCamera.type == "orthographic") { + cameraData.isPerspective = false; + cameraData.orthographicSize = static_cast(gltfCamera.orthographic.ymag); + cameraData.nearPlane = static_cast(gltfCamera.orthographic.znear); + cameraData.farPlane = static_cast(gltfCamera.orthographic.zfar); + std::cout << " Orthographic camera: Size=" << cameraData.orthographicSize + << ", Near=" << cameraData.nearPlane + << ", Far=" << cameraData.farPlane << std::endl; + } + + // Find the node that uses this camera to get transform information + for (const auto& node : gltfModel.nodes) { + if (node.camera == static_cast(i)) { + // Extract transform 
from node + if (node.translation.size() == 3) { + cameraData.position = glm::vec3( + static_cast(node.translation[0]), + static_cast(node.translation[1]), + static_cast(node.translation[2])); + } + + if (node.rotation.size() == 4) { + cameraData.rotation = glm::quat( + static_cast(node.rotation[3]), + // w + static_cast(node.rotation[0]), + // x + static_cast(node.rotation[1]), + // y + static_cast(node.rotation[2]) // z + ); + } + + std::cout << " Position: (" << cameraData.position.x << ", " + << cameraData.position.y << ", " << cameraData.position.z << ")" << std::endl; + break; + } + } + + model->cameras.push_back(cameraData); + } + } +} +void ModelLoader::ProcessAnimations(const tinygltf::Model& gltfModel, Model* model) { + if (!gltfModel.animations.empty()) { + std::cout << "Found " << gltfModel.animations.size() << " animation(s) in GLTF file" << std::endl; + + std::vector parsedAnimations; + parsedAnimations.reserve(gltfModel.animations.size()); + + for (size_t animIdx = 0; animIdx < gltfModel.animations.size(); ++animIdx) { + const auto& gltfAnim = gltfModel.animations[animIdx]; + + Animation anim; + anim.name = gltfAnim.name.empty() ? 
("animation_" + std::to_string(animIdx)) : gltfAnim.name; + + // Parse samplers + anim.samplers.reserve(gltfAnim.samplers.size()); + for (const auto& gltfSampler : gltfAnim.samplers) { + AnimationSampler sampler; + + // Parse interpolation type + if (gltfSampler.interpolation == "STEP") { + sampler.interpolation = AnimationInterpolation::Step; + } else if (gltfSampler.interpolation == "CUBICSPLINE") { + sampler.interpolation = AnimationInterpolation::CubicSpline; + } else { + sampler.interpolation = AnimationInterpolation::Linear; + } + + // Read input (time) accessor + if (gltfSampler.input >= 0 && gltfSampler.input < static_cast(gltfModel.accessors.size())) { + const auto& inputAccessor = gltfModel.accessors[gltfSampler.input]; + const auto& inputBufferView = gltfModel.bufferViews[inputAccessor.bufferView]; + const auto& inputBuffer = gltfModel.buffers[inputBufferView.buffer]; + + const float* inputData = reinterpret_cast( + &inputBuffer.data[inputBufferView.byteOffset + inputAccessor.byteOffset]); + + sampler.inputTimes.resize(inputAccessor.count); + for (size_t i = 0; i < inputAccessor.count; ++i) { + sampler.inputTimes[i] = inputData[i]; + } + } + + // Read output (value) accessor + if (gltfSampler.output >= 0 && gltfSampler.output < static_cast(gltfModel.accessors.size())) { + const auto& outputAccessor = gltfModel.accessors[gltfSampler.output]; + const auto& outputBufferView = gltfModel.bufferViews[outputAccessor.bufferView]; + const auto& outputBuffer = gltfModel.buffers[outputBufferView.buffer]; + + const float* outputData = reinterpret_cast( + &outputBuffer.data[outputBufferView.byteOffset + outputAccessor.byteOffset]); + + // Determine number of floats per element based on accessor type + size_t componentsPerElement = 1; + if (outputAccessor.type == TINYGLTF_TYPE_VEC3) { + componentsPerElement = 3; + } else if (outputAccessor.type == TINYGLTF_TYPE_VEC4) { + componentsPerElement = 4; + } + + size_t totalFloats = outputAccessor.count * 
componentsPerElement; + sampler.outputValues.resize(totalFloats); + for (size_t i = 0; i < totalFloats; ++i) { + sampler.outputValues[i] = outputData[i]; + } + } + + anim.samplers.push_back(std::move(sampler)); + } + + // Parse channels + anim.channels.reserve(gltfAnim.channels.size()); + for (const auto& gltfChannel : gltfAnim.channels) { + AnimationChannel channel; + channel.samplerIndex = gltfChannel.sampler; + channel.targetNode = gltfChannel.target_node; + + // Parse target path + if (gltfChannel.target_path == "translation") { + channel.path = AnimationPath::Translation; + } else if (gltfChannel.target_path == "rotation") { + channel.path = AnimationPath::Rotation; + } else if (gltfChannel.target_path == "scale") { + channel.path = AnimationPath::Scale; + } else if (gltfChannel.target_path == "weights") { + channel.path = AnimationPath::Weights; + } + + anim.channels.push_back(channel); + } + + std::cout << " Animation '" << anim.name << "': " + << anim.samplers.size() << " samplers, " + << anim.channels.size() << " channels, " + << "duration=" << anim.GetDuration() << "s" << std::endl; + + parsedAnimations.push_back(std::move(anim)); + } + + model->SetAnimations(parsedAnimations); + std::cout << "Loaded " << parsedAnimations.size() << " animations into model" << std::endl; + } +} + +bool ModelLoader::ParseGLTF(const std::string& filename, Model* model) { + std::string resolvedPath = renderer->ResolvePath(filename); + std::cout << "Parsing GLTF file: " << resolvedPath << " (original: " << filename << ")" << std::endl; + + // Extract the directory path from the model file to use as a base path for textures + std::filesystem::path modelPath(resolvedPath); + std::filesystem::path baseDir = std::filesystem::absolute(modelPath).parent_path(); + std::string baseTexturePath = baseDir.string(); + if (!baseTexturePath.empty() && baseTexturePath.back() != '/') { + baseTexturePath += "/"; + } + std::cout << "Using base texture path: " << baseTexturePath << std::endl; + 
+ // Create tinygltf loader + tinygltf::Model gltfModel; + tinygltf::TinyGLTF loader; + std::string err; + std::string warn; + + // Set up image loader: prefer KTX2 via libktx; fallback to stb for other formats + loader.SetImageLoader(LoadKTX2Image, nullptr); + + // Load the GLTF file + bool ret = false; + if (resolvedPath.find(".glb") != std::string::npos) { + ret = loader.LoadBinaryFromFile(&gltfModel, &err, &warn, resolvedPath); + } else { + ret = loader.LoadASCIIFromFile(&gltfModel, &err, &warn, resolvedPath); + } + + if (!warn.empty()) { + std::cout << "GLTF Warning: " << warn << std::endl; + } + + if (!err.empty()) { + std::cerr << "GLTF Error: " << err << std::endl; + return false; + } + + if (!ret) { + std::cerr << "Failed to parse GLTF file: " << resolvedPath << " (original: " << filename << ")" << std::endl; + return false; + } + + // Extract mesh data from the first mesh (for now, we'll handle multiple meshes later) + if (gltfModel.meshes.empty()) { + std::cerr << "No meshes found in GLTF file" << std::endl; + return false; + } + + light_scale = 1.0f; + // Test if generator is blender and apply the blender factor see the issue here: https://github.com/KhronosGroup/glTF/issues/2473 + if (gltfModel.asset.generator.find("blender") != std::string::npos) { + std::cout << "Blender generator detected, applying blender factor" << std::endl; + light_scale = EMISSIVE_SCALE_FACTOR; + } + + // Track loaded textures to prevent loading the same texture multiple times + std::set loadedTextures; + + // Process materials first + ProcessMaterials(gltfModel, baseTexturePath, loadedTextures); + + // Process cameras from the GLTF file + ProcessCameras(gltfModel, model); + + // Process animations from the GLTF file + ProcessAnimations(gltfModel, model); + + // Collect all animated node indices from parsed animations + std::set animatedNodeIndices; + for (const auto& anim : model->GetAnimations()) { + for (const auto& channel : anim.channels) { + if (channel.targetNode >= 0) { 
+ animatedNodeIndices.insert(channel.targetNode); + } + } + } + if (!animatedNodeIndices.empty()) { + std::cout << "[Animation] Found " << animatedNodeIndices.size() << " unique animated node(s)" << std::endl; + } + + // Process scene hierarchy to get node transforms for meshes + std::map> meshInstanceTransforms; // Map from mesh index to all instance transforms + std::unordered_map animatedNodeTransforms; // Map from animated node index to world transform + std::unordered_map animatedNodeMeshes; // Map from animated node index to mesh index + + // Helper function to calculate transform matrix from the GLTF node + auto calculateNodeTransform = [](const tinygltf::Node& node) -> glm::mat4 { + glm::mat4 transform; + + // Apply matrix if present + if (node.matrix.size() == 16) { + // GLTF matrices are column-major, the same as GLM + transform = glm::mat4( + node.matrix[0], + node.matrix[1], + node.matrix[2], + node.matrix[3], + node.matrix[4], + node.matrix[5], + node.matrix[6], + node.matrix[7], + node.matrix[8], + node.matrix[9], + node.matrix[10], + node.matrix[11], + node.matrix[12], + node.matrix[13], + node.matrix[14], + node.matrix[15]); + } else { + // Build transform from TRS components + glm::mat4 translation = glm::mat4(1.0f); + glm::mat4 rotation = glm::mat4(1.0f); + glm::mat4 scale = glm::mat4(1.0f); + + // Translation + if (node.translation.size() == 3) { + translation = glm::translate(glm::mat4(1.0f), + glm::vec3( + static_cast(node.translation[0]), + static_cast(node.translation[1]), + static_cast(node.translation[2]))); + } + + // Rotation (quaternion) + if (node.rotation.size() == 4) { + glm::quat quat( + static_cast(node.rotation[3]), + // w + static_cast(node.rotation[0]), + // x + static_cast(node.rotation[1]), + // y + static_cast(node.rotation[2]) // z + ); + rotation = glm::mat4_cast(quat); + } + + // Scale + if (node.scale.size() == 3) { + scale = glm::scale(glm::mat4(1.0f), + glm::vec3( + static_cast(node.scale[0]), + 
static_cast(node.scale[1]), + static_cast(node.scale[2]))); + } + + // Combine: T * R * S + transform = translation * rotation * scale; + } + + return transform; + }; + + // Recursive function to traverse scene hierarchy + std::function < void(int, const glm::mat4 &) > traverseNode = [&](int nodeIndex, const glm::mat4& parentTransform) { + if (nodeIndex < 0 || nodeIndex >= gltfModel.nodes.size()) { + return; + } + + const tinygltf::Node& node = gltfModel.nodes[nodeIndex]; + + // Calculate this node's transform + glm::mat4 nodeTransform = calculateNodeTransform(node); + glm::mat4 worldTransform = parentTransform * nodeTransform; + + // If this node has a mesh, add the transform to the instances list + if (node.mesh >= 0 && node.mesh < gltfModel.meshes.size()) { + meshInstanceTransforms[node.mesh].push_back(worldTransform); + } + + // If this node is animated, capture its world transform and mesh reference + if (animatedNodeIndices.contains(nodeIndex)) { + animatedNodeTransforms[nodeIndex] = worldTransform; + if (node.mesh >= 0) { + animatedNodeMeshes[nodeIndex] = node.mesh; + std::cout << "[Animation] Captured transform for animated node " << nodeIndex + << " (" << node.name << ") with mesh " << node.mesh << std::endl; + } else { + std::cout << "[Animation] Captured transform for animated node " << nodeIndex + << " (" << node.name << ") - no mesh" << std::endl; + } + } + + // Recursively process children + for (int childIndex : node.children) { + traverseNode(childIndex, worldTransform); + } + }; + + // Process all scenes (typically there's only one default scene) + if (!gltfModel.scenes.empty()) { + int defaultScene = gltfModel.defaultScene >= 0 ? 
gltfModel.defaultScene : 0; + if (defaultScene < gltfModel.scenes.size()) { + const tinygltf::Scene& scene = gltfModel.scenes[defaultScene]; + + // Traverse all root nodes in the scene + for (int rootNodeIndex : scene.nodes) { + traverseNode(rootNodeIndex, glm::mat4(1.0f)); + } + } + } + + // Store animated node transforms in the model for use by AnimationComponent + if (!animatedNodeTransforms.empty()) { + model->SetAnimatedNodeTransforms(animatedNodeTransforms); + std::cout << "[Animation] Stored " << animatedNodeTransforms.size() + << " animated node transform(s) in model" << std::endl; + } + + // Store animated node mesh mappings for linking geometry entities to animations + if (!animatedNodeMeshes.empty()) { + model->SetAnimatedNodeMeshes(animatedNodeMeshes); + std::cout << "[Animation] Stored " << animatedNodeMeshes.size() + << " animated node mesh mapping(s) in model" << std::endl; + } + + std::map geometryMaterialMeshMap; // Map from geometry+material hash to unique MaterialMesh + + // Helper function to create a geometry hash for deduplication + auto createGeometryHash = [](const tinygltf::Primitive& primitive, int materialIndex) -> std::string { + std::string hash = "mat_" + std::to_string(materialIndex); + + // Add primitive attribute hashes to ensure unique geometry identification + if (primitive.indices >= 0) { + hash += "_idx_" + std::to_string(primitive.indices); + } + + for (const auto& [attrName, type] : primitive.attributes) { + hash += "_" + attrName + "_" + std::to_string(type); + } + + return hash; + }; + + // Process all meshes with improved instancing support + for (size_t meshIndex = 0; meshIndex < gltfModel.meshes.size(); ++meshIndex) { + const auto& mesh = gltfModel.meshes[meshIndex]; + + // Check if this mesh has instances + auto instanceIt = meshInstanceTransforms.find(static_cast(meshIndex)); + std::vector instances; + + if (instanceIt == meshInstanceTransforms.end() || instanceIt->second.empty()) { + instances.emplace_back(1.0f); // 
Identity transform at origin + } else { + instances = instanceIt->second; + } + + // Process each primitive (material group) in this mesh + for (const auto& primitive : mesh.primitives) { + // Get the material index for this primitive + int materialIndex = primitive.material; + if (materialIndex < 0) { + materialIndex = -1; // Use -1 for primitives without materials + } + + // Create a unique geometry hash for this primitive and material combination + std::string geometryHash = createGeometryHash(primitive, materialIndex); + + // Use try_emplace to efficiently insert if not present and get reference + auto [it, inserted] = geometryMaterialMeshMap.try_emplace(geometryHash); + + if (inserted) { + // New entry was created - initialize it + MaterialMesh& materialMesh = it->second; + materialMesh.materialIndex = materialIndex; + materialMesh.sourceMeshIndex = static_cast(meshIndex); // Track source mesh for animations + + // Set material name + if (materialIndex >= 0 && materialIndex < gltfModel.materials.size()) { + const auto& gltfMaterial = gltfModel.materials[materialIndex]; + materialMesh.materialName = gltfMaterial.name.empty() ? 
("material_" + std::to_string(materialIndex)) : gltfMaterial.name; + } else { + materialMesh.materialName = "no_material"; + } + } + + MaterialMesh& materialMesh = it->second; + + // Only process geometry if this MaterialMesh is empty (first time processing this geometry) + if (materialMesh.vertices.empty()) { + auto vertexOffsetInMaterialMesh = static_cast(materialMesh.vertices.size()); + + // Get indices for this primitive (your existing code is correct) + if (primitive.indices >= 0) { + const tinygltf::Accessor& indexAccessor = gltfModel.accessors[primitive.indices]; + const tinygltf::BufferView& indexBufferView = gltfModel.bufferViews[indexAccessor.bufferView]; + const tinygltf::Buffer& indexBuffer = gltfModel.buffers[indexBufferView.buffer]; + const void* indexData = &indexBuffer.data[indexBufferView.byteOffset + indexAccessor.byteOffset]; + if (indexAccessor.componentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT) { + const auto* buf = static_cast(indexData); + for (size_t i = 0; i < indexAccessor.count; ++i) { + materialMesh.indices.push_back(buf[i] + vertexOffsetInMaterialMesh); + } + } else if (indexAccessor.componentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_INT) { + const auto* buf = static_cast(indexData); + for (size_t i = 0; i < indexAccessor.count; ++i) { + materialMesh.indices.push_back(buf[i] + vertexOffsetInMaterialMesh); + } + } + } + + // --- START: FINAL SAFE AND CORRECT VERTEX LOADING --- + + // Get the position accessor, which defines the vertex count. + auto posIt = primitive.attributes.find("POSITION"); + if (posIt == primitive.attributes.end()) + continue; + const tinygltf::Accessor& posAccessor = gltfModel.accessors[posIt->second]; + + // Get data pointers and strides for all available attributes ONCE before the loop. 
+ const tinygltf::BufferView& posBufferView = gltfModel.bufferViews[posAccessor.bufferView]; + const tinygltf::Buffer& buffer = gltfModel.buffers[posBufferView.buffer]; + const unsigned char* pPositions = &buffer.data[posBufferView.byteOffset + posAccessor.byteOffset]; + const size_t posByteStride = posBufferView.byteStride == 0 ? sizeof(glm::vec3) : posBufferView.byteStride; + + const unsigned char* pNormals = nullptr; + size_t normalByteStride = 0; + auto normalIt = primitive.attributes.find("NORMAL"); + if (normalIt != primitive.attributes.end()) { + const tinygltf::Accessor& normalAccessor = gltfModel.accessors[normalIt->second]; + const tinygltf::BufferView& normalBufferView = gltfModel.bufferViews[normalAccessor.bufferView]; + pNormals = &gltfModel.buffers[normalBufferView.buffer].data[normalBufferView.byteOffset + normalAccessor.byteOffset]; + normalByteStride = normalBufferView.byteStride == 0 ? sizeof(glm::vec3) : normalBufferView.byteStride; + } + + const unsigned char* pTexCoords = nullptr; + size_t texCoordByteStride = 0; + auto texCoordIt = primitive.attributes.find("TEXCOORD_0"); + if (texCoordIt != primitive.attributes.end()) { + const tinygltf::Accessor& texCoordAccessor = gltfModel.accessors[texCoordIt->second]; + const tinygltf::BufferView& texCoordBufferView = gltfModel.bufferViews[texCoordAccessor.bufferView]; + pTexCoords = &gltfModel.buffers[texCoordBufferView.buffer].data[texCoordBufferView.byteOffset + texCoordAccessor.byteOffset]; + texCoordByteStride = texCoordBufferView.byteStride == 0 ? 
sizeof(glm::vec2) : texCoordBufferView.byteStride; + } + + const unsigned char* pTangents = nullptr; + size_t tangentByteStride = 0; + auto tangentIt = primitive.attributes.find("TANGENT"); + bool hasTangents = (tangentIt != primitive.attributes.end()); + if (hasTangents) { + const tinygltf::Accessor& tangentAccessor = gltfModel.accessors[tangentIt->second]; + const tinygltf::BufferView& tangentBufferView = gltfModel.bufferViews[tangentAccessor.bufferView]; + pTangents = &gltfModel.buffers[tangentBufferView.buffer].data[tangentBufferView.byteOffset + tangentAccessor.byteOffset]; + tangentByteStride = tangentBufferView.byteStride == 0 ? sizeof(glm::vec4) : tangentBufferView.byteStride; + } + + // Append vertices for this primitive preserving prior vertices + size_t baseVertex = materialMesh.vertices.size(); + materialMesh.vertices.resize(baseVertex + posAccessor.count); + + // Use a SINGLE, SAFE loop to load all vertex data. + for (size_t i = 0; i < posAccessor.count; ++i) { + auto& [position, normal, texCoord, tangent] = materialMesh.vertices[baseVertex + i]; + + position = *reinterpret_cast(pPositions + i * posByteStride); + + if (pNormals) { + normal = *reinterpret_cast(pNormals + i * normalByteStride); + } else { + normal = glm::vec3(0.0f, 0.0f, 1.0f); + } + // Normalize normals to ensure consistent magnitude + if (glm::dot(normal, normal) > 0.0f) { + normal = glm::normalize(normal); + } else { + normal = glm::vec3(0.0f, 0.0f, 1.0f); + } + + if (pTexCoords) { + texCoord = *reinterpret_cast(pTexCoords + i * texCoordByteStride); + } else { + texCoord = glm::vec2(0.0f, 0.0f); + } + + if (hasTangents && pTangents) { + // Load glTF tangent and ensure it is normalized and orthogonal to the normal. 
+ glm::vec4 t4 = *reinterpret_cast(pTangents + i * tangentByteStride); + glm::vec3 T = glm::vec3(t4); + // Normalize tangent and make it orthogonal to normal to avoid skewed TBN + if (glm::dot(T, T) > 0.0f) { + T = glm::normalize(T); + T = glm::normalize(T - normal * glm::dot(normal, T)); + } else { + T = glm::vec3(1.0f, 0.0f, 0.0f); + } + float w = (t4.w >= 0.0f) ? 1.0f : -1.0f; // clamp handedness to +/-1 + tangent = glm::vec4(T, w); + } else { + // No tangents in source: use a safe default tangent (T=+X, handedness=+1) + tangent = glm::vec4(1.0f, 0.0f, 0.0f, 1.0f); + } + } + + // AFTER the mesh is fully built, generate tangents via MikkTSpace ONLY if the source mesh lacks glTF tangents. + if (!hasTangents) { + if (pNormals && pTexCoords && !materialMesh.indices.empty()) { + MikkTSpaceInterface mikkInterface; + mikkInterface.vertices = &materialMesh.vertices; + mikkInterface.indices = &materialMesh.indices; + + SMikkTSpaceInterface sm_interface{}; + sm_interface.m_getNumFaces = getNumFaces; + sm_interface.m_getNumVerticesOfFace = getNumVerticesOfFace; + sm_interface.m_getPosition = getPosition; + sm_interface.m_getNormal = getNormal; + sm_interface.m_getTexCoord = getTexCoord; + sm_interface.m_setTSpaceBasic = setTSpaceBasic; + + SMikkTSpaceContext mikk_context{}; + mikk_context.m_pInterface = &sm_interface; + mikk_context.m_pUserData = &mikkInterface; + + if (genTangSpaceDefault(&mikk_context)) { + std::cout << " Generated tangents (MikkTSpace) for material: " << materialMesh.materialName << std::endl; + } else { + std::cerr << " Failed to generate tangents for material: " << materialMesh.materialName << std::endl; + } + } else { + std::cout << " Skipping tangent generation (missing normals, UVs, or indices) for material: " << materialMesh.materialName << std::endl; + } + } else { + std::cout << " Using glTF-provided tangents for material: " << materialMesh.materialName << std::endl; + } + // --- END: FINAL SAFE AND CORRECT VERTEX LOADING --- + } + + // Add all 
instances to this MaterialMesh (both new and existing geometry) + for (const glm::mat4& instanceTransform : instances) { + materialMesh.AddInstance(instanceTransform, static_cast(materialIndex)); + } + } + } + + // Convert geometry-based material mesh map to vector + std::vector modelMaterialMeshes; + modelMaterialMeshes.reserve(geometryMaterialMeshMap.size()); + for (auto& kv : geometryMaterialMeshMap) { + modelMaterialMeshes.push_back(std::move(kv.second)); + } + + // Process texture loading for each MaterialMesh + std::vector combinedVertices; + std::vector combinedIndices; + + // Reserve space for combined mesh data to avoid reallocations + size_t totalVertices = 0; + size_t totalIndices = 0; + for (const auto& materialMesh : modelMaterialMeshes) { + if (!materialMesh.instances.empty()) { + totalVertices += materialMesh.vertices.size(); + totalIndices += materialMesh.indices.size(); + } + } + combinedVertices.reserve(totalVertices); + combinedIndices.reserve(totalIndices); + + // Process texture loading for each MaterialMesh + for (auto& materialMesh : modelMaterialMeshes) { + int materialIndex = materialMesh.materialIndex; + + // Get ALL texture paths for this material (same as ParseGLTFDataOnly) + if (materialIndex >= 0 && materialIndex < gltfModel.materials.size()) { + const auto& gltfMaterial = gltfModel.materials[materialIndex]; + + // Extract base color texture + if (gltfMaterial.pbrMetallicRoughness.baseColorTexture.index >= 0) { + int texIndex = gltfMaterial.pbrMetallicRoughness.baseColorTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + int imageIndex = -1; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + imageIndex = texture.source; + } else { + auto extIt = texture.extensions.find("KHR_texture_basisu"); + if (extIt != texture.extensions.end()) { + const tinygltf::Value& ext = extIt->second; + if (ext.Has("source") && ext.Get("source").IsInt()) { + int src = 
ext.Get("source").Get(); + if (src >= 0 && src < static_cast(gltfModel.images.size())) { + imageIndex = src; + } + } + } + } + if (imageIndex >= 0) { + std::string textureId = "gltf_baseColor_" + std::to_string(texIndex); + materialMesh.baseColorTexturePath = textureId; + materialMesh.texturePath = textureId; // Keep for backward compatibility (now baseColor‑tagged) + + // Load texture data (embedded or external) with caching + const auto& image = gltfModel.images[imageIndex]; + if (!image.image.empty()) { + if (!loadedTextures.contains(textureId)) { + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component, true); + loadedTextures.insert(textureId); + } + } else { + std::cerr << " Warning: No decoded bytes for baseColor texture index " << texIndex << std::endl; + } + } + } + } else { + // Since texture indices are -1, try to find external texture files by material name + std::string materialName = materialMesh.materialName; + + // Look for external texture files that match this specific material (case-insensitive) + for (const auto& image : gltfModel.images) { + if (!image.uri.empty()) { + std::string imageUri = image.uri; + // Lowercase copies for robust matching + std::string imageUriLower = imageUri; + std::ranges::transform(imageUriLower, imageUriLower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + std::string materialNameLower = materialName; + std::ranges::transform(materialNameLower, materialNameLower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + + // Check if this image belongs to this specific material based on naming patterns + // Look for basecolor/albedo/diffuse textures that match the material name + if ((imageUriLower.find("basecolor") != std::string::npos || + imageUriLower.find("albedo") != std::string::npos || + imageUriLower.find("diffuse") != std::string::npos) && + (imageUriLower.find(materialNameLower) != std::string::npos || + 
materialNameLower.find(imageUriLower.substr(0, imageUriLower.find('_'))) != std::string::npos)) { + // Use the relative path from the GLTF directory + std::string textureId = baseTexturePath + imageUri; + if (!image.image.empty()) { + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + materialMesh.baseColorTexturePath = textureId; + materialMesh.texturePath = textureId; + } else { + // Fallback: offload KTX2 file load to renderer worker threads + renderer->LoadTextureAsync(textureId, true); + materialMesh.baseColorTexturePath = textureId; + materialMesh.texturePath = textureId; + } + break; + } + } + } + } + + // Extract normal texture + if (gltfMaterial.normalTexture.index >= 0) { + int texIndex = gltfMaterial.normalTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + std::string textureId = "gltf_texture_" + std::to_string(texIndex); + materialMesh.normalTexturePath = textureId; + + // Load texture data (embedded or external) + const auto& image = gltfModel.images[texture.source]; + if (!image.image.empty()) { + // Load embedded texture data + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + } else if (!image.uri.empty()) { + // Fallback: offload KTX2 normal map load to renderer worker threads + std::string filePath = baseTexturePath + image.uri; + renderer->RegisterTextureAlias(textureId, filePath); + renderer->LoadTextureAsync(filePath); + materialMesh.normalTexturePath = textureId; + } else { + std::cerr << " Warning: No decoded bytes for normal texture index " << texIndex << std::endl; + } + } + } + } else { + // Heuristic: search images for a normal texture for this material and load from memory + std::string materialName = materialMesh.materialName; + for (const auto& image : gltfModel.images) { 
+ if (!image.uri.empty()) { + std::string imageUri = image.uri; + if (imageUri.find("Normal") != std::string::npos && + (imageUri.find(materialName) != std::string::npos || + materialName.find(imageUri.substr(0, imageUri.find('_'))) != std::string::npos)) { + std::string textureId = baseTexturePath + imageUri; + if (!image.image.empty()) { + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + materialMesh.normalTexturePath = textureId; + } else { + std::cerr << " Warning: Heuristic normal image has no decoded bytes: " << imageUri << std::endl; + } + break; + } + } + } + } + + // Extract metallic-roughness texture + if (gltfMaterial.pbrMetallicRoughness.metallicRoughnessTexture.index >= 0) { + int texIndex = gltfMaterial.pbrMetallicRoughness.metallicRoughnessTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + std::string textureId = "gltf_texture_" + std::to_string(texIndex); + materialMesh.metallicRoughnessTexturePath = textureId; + + // Load texture data (embedded or external) + const auto& image = gltfModel.images[texture.source]; + if (!image.image.empty()) { + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + materialMesh.metallicRoughnessTexturePath = textureId; + } else { + std::cerr << " Warning: No decoded bytes for metallic-roughness texture index " << texIndex << std::endl; + } + } + } + } else { + // Look for external metallic-roughness texture files that match this specific material + std::string materialName = materialMesh.materialName; + for (const auto& image : gltfModel.images) { + if (!image.uri.empty()) { + std::string imageUri = image.uri; + if ((imageUri.find("Metallic") != std::string::npos || + imageUri.find("Roughness") != std::string::npos || + imageUri.find("Specular") != 
std::string::npos) && + (imageUri.find(materialName) != std::string::npos || + materialName.find(imageUri.substr(0, imageUri.find('_'))) != std::string::npos)) { + std::string texturePath = baseTexturePath + imageUri; + materialMesh.metallicRoughnessTexturePath = texturePath; + std::cout << " Found external metallic-roughness texture for " << materialName << ": " << texturePath << std::endl; + break; + } + } + } + } + + // Extract occlusion texture + if (gltfMaterial.occlusionTexture.index >= 0) { + int texIndex = gltfMaterial.occlusionTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + std::string textureId = "gltf_texture_" + std::to_string(texIndex); + materialMesh.occlusionTexturePath = textureId; + + // Load texture data (embedded or external) + const auto& image = gltfModel.images[texture.source]; + if (!image.image.empty()) { + if (renderer->LoadTextureFromMemory(textureId, + image.image.data(), + image.width, + image.height, + image.component)) { + materialMesh.occlusionTexturePath = textureId; + std::cout << " Loaded occlusion texture from memory: " << textureId + << " (" << image.width << "x" << image.height << ")" << std::endl; + } else { + std::cerr << " Failed to load occlusion texture from memory: " << textureId << std::endl; + } + } else { + std::cerr << " Warning: No decoded bytes for occlusion texture index " << texIndex << std::endl; + } + } + } + } else { + // Heuristic: search images for an occlusion texture for this material and load from memory + std::string materialName = materialMesh.materialName; + for (const auto& image : gltfModel.images) { + if (!image.uri.empty()) { + std::string imageUri = image.uri; + if ((imageUri.find("Occlusion") != std::string::npos || + imageUri.find("AO") != std::string::npos) && + (imageUri.find(materialName) != std::string::npos || + materialName.find(imageUri.substr(0, 
imageUri.find('_'))) != std::string::npos)) { + std::string textureId = baseTexturePath + imageUri; + if (!image.image.empty()) { + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + materialMesh.occlusionTexturePath = textureId; + } else { + std::cerr << " Warning: Heuristic occlusion image has no decoded bytes: " << imageUri << std::endl; + } + break; + } + } + } + } + + // Extract emissive texture + if (gltfMaterial.emissiveTexture.index >= 0) { + int texIndex = gltfMaterial.emissiveTexture.index; + if (texIndex < gltfModel.textures.size()) { + const auto& texture = gltfModel.textures[texIndex]; + if (texture.source >= 0 && texture.source < gltfModel.images.size()) { + std::string textureId = "gltf_texture_" + std::to_string(texIndex); + materialMesh.emissiveTexturePath = textureId; + + // Load texture data (embedded or external) + const auto& image = gltfModel.images[texture.source]; + if (!image.image.empty()) { + // Load embedded texture data + renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component); + } else if (!image.uri.empty()) { + // Record external texture file path (loaded later by renderer) + std::string texturePath = baseTexturePath + image.uri; + materialMesh.emissiveTexturePath = texturePath; + } + } + } + } else { + // Look for external emissive texture files that match this specific material + std::string materialName = materialMesh.materialName; + for (const auto& image : gltfModel.images) { + if (!image.uri.empty()) { + std::string imageUri = image.uri; + if ((imageUri.find("Emissive") != std::string::npos || + imageUri.find("Emission") != std::string::npos) && + (imageUri.find(materialName) != std::string::npos || + materialName.find(imageUri.substr(0, imageUri.find('_'))) != std::string::npos)) { + std::string texturePath = baseTexturePath + imageUri; + materialMesh.emissiveTexturePath = texturePath; + break; + } + } + } + 
} + } + + // Add to combined mesh for backward compatibility (keep vertices in an original coordinate system) + if (!materialMesh.instances.empty()) { + size_t vertexOffset = combinedVertices.size(); + + // Instance transforms should be handled by the instancing system, not applied to vertex data + for (const auto& vertex : materialMesh.vertices) { + // Use vertices as-is without any transformation + combinedVertices.push_back(vertex); + } + + for (uint32_t index : materialMesh.indices) { + combinedIndices.push_back(index + static_cast(vertexOffset)); + } + } + } + + // Store material meshes for this model + materialMeshes[filename] = modelMaterialMeshes; + + // Set the combined mesh data in the model for backward compatibility + model->SetVertices(combinedVertices); + model->SetIndices(combinedIndices); + + // Extract lights from the GLTF model + std::cout << "Extracting lights from GLTF model..." << std::endl; + + // Extract punctual lights (KHR_lights_punctual extension) + if (ExtractPunctualLights(gltfModel, filename)) { + std::cerr << "Warning: Failed to extract punctual lights from " << filename << std::endl; + } + + std::cout << "GLTF model loaded successfully with " << combinedVertices.size() << " vertices and " << combinedIndices.size() << " indices" << std::endl; + return true; +} + +std::vector ModelLoader::GetExtractedLights(const std::string& modelName) const { + std::vector lights; + + // First, try to get punctual lights from the extracted lights storage + auto lightIt = extractedLights.find(modelName); + if (lightIt != extractedLights.end()) { + lights = lightIt->second; + std::cout << "Found " << lights.size() << " punctual lights for model: " << modelName << std::endl; + } + + // Now extract emissive materials as light sources + auto materialMeshIt = materialMeshes.find(modelName); + if (materialMeshIt != materialMeshes.end()) { + for (const auto& materialMesh : materialMeshIt->second) { + // Get the material for this mesh + auto materialIt = 
materials.find(materialMesh.materialName); + if (materialIt != materials.end()) { + const Material* material = materialIt->second.get(); + + // Check if this material has emissive properties (no threshold filtering) + float emissiveIntensity = glm::length(material->emissive) * material->emissiveStrength; + if (emissiveIntensity >= 0.1f) { + // Calculate the center position and an approximate size of the emissive surface + glm::vec3 center(0.0f); + glm::vec3 minB(std::numeric_limits::max()); + glm::vec3 maxB(-std::numeric_limits::max()); + if (!materialMesh.vertices.empty()) { + for (const auto& vertex : materialMesh.vertices) { + center += vertex.position; + minB = glm::min(minB, vertex.position); + maxB = glm::max(maxB, vertex.position); + } + center /= static_cast(materialMesh.vertices.size()); + } + glm::vec3 extent = glm::max(maxB - minB, glm::vec3(0.0f)); + float diag = glm::length(extent); + float baseRange = std::max(0.5f * diag, 0.25f); // base range in local units + + // Calculate a reasonable direction (average normal of the surface) + glm::vec3 avgNormal(0.0f); + if (!materialMesh.vertices.empty()) { + avgNormal = std::accumulate( + materialMesh.vertices.begin(), + materialMesh.vertices.end(), + glm::vec3(0.0f), + [](const glm::vec3& acc, const Vertex& vertex) { return acc + vertex.normal; } + ); + avgNormal = glm::normalize(avgNormal / static_cast(materialMesh.vertices.size())); + } else { + avgNormal = glm::vec3(0.0f, -1.0f, 0.0f); // Default downward direction + } + + // Create emissive light(s) transformed by each instance's model matrix + if (!materialMesh.instances.empty()) { + for (const auto& inst : materialMesh.instances) { + glm::mat4 M = inst.getModelMatrix(); + glm::vec3 worldCenter = glm::vec3(M * glm::vec4(center, 1.0f)); + glm::mat3 normalMat = glm::transpose(glm::inverse(glm::mat3(M))); + glm::vec3 worldNormal = glm::normalize(normalMat * avgNormal); + + // Estimate a uniform scale factor from the instance transform + float sx = 
glm::length(glm::vec3(M[0])); + float sy = glm::length(glm::vec3(M[1])); + float sz = glm::length(glm::vec3(M[2])); + float sMax = std::max(sx, std::max(sy, sz)); + // Slightly conservative halo; avoid massive ranges that wash out the scene + float worldRange = baseRange * std::max(1.0f, sMax) * 1.25f; + + ExtractedLight emissiveLight; + emissiveLight.type = ExtractedLight::Type::Emissive; + emissiveLight.position = worldCenter; + // Separate chroma from intensity to avoid double-powering color and intensity + glm::vec3 chroma = material->emissive; + float chromaMag = glm::length(chroma); + emissiveLight.color = (chromaMag > 1e-6f) ? (chroma / chromaMag) : chroma; + float strength = hasEmissiveStrengthExtension ? material->emissiveStrength : 1.0f; + // Use a surface-area proxy from local bounds (diag^2) scaled by instance size, not range^2 + float areaProxy = std::max(diag * diag * std::max(1.0f, sMax), 0.01f); + float intensityRaw = strength * chromaMag * areaProxy * 0.08f; // conservative scalar + // Clamp to a reasonable band to avoid blowing out exposure + emissiveLight.intensity = glm::clamp(intensityRaw, 0.25f, 50.0f); + emissiveLight.range = worldRange; + emissiveLight.sourceMaterial = material->GetName(); + emissiveLight.direction = worldNormal; + + lights.push_back(emissiveLight); + + std::cout << "Created emissive light from material '" << material->GetName() + << "' at world position (" << worldCenter.x << ", " << worldCenter.y << ", " << worldCenter.z + << ") with intensity " << emissiveIntensity << std::endl; + } + } else { + // No explicit instances; use identity transform + ExtractedLight emissiveLight; + emissiveLight.type = ExtractedLight::Type::Emissive; + emissiveLight.position = center; + // Separate chroma from intensity + glm::vec3 chroma = material->emissive; + float chromaMag = glm::length(chroma); + emissiveLight.color = (chromaMag > 1e-6f) ? (chroma / chromaMag) : chroma; + float strength = hasEmissiveStrengthExtension ? 
material->emissiveStrength : 1.0f; + float worldRange = baseRange * 1.25f; + float areaProxy = std::max(diag * diag, 0.01f); + float intensityRaw = strength * chromaMag * areaProxy * 0.08f; + emissiveLight.intensity = glm::clamp(intensityRaw, 0.25f, 50.0f); + emissiveLight.range = worldRange; + emissiveLight.sourceMaterial = material->GetName(); + emissiveLight.direction = avgNormal; + + lights.push_back(emissiveLight); + + std::cout << "Created emissive light from material '" << material->GetName() + << "' at position (" << center.x << ", " << center.y << ", " << center.z + << ") with intensity " << emissiveIntensity << std::endl; + } + } + } + } + } + + std::cout << "Total lights extracted for model '" << modelName << "': " << lights.size() + << " (including emissive-derived lights)" << std::endl; + + return lights; +} + +const std::vector& ModelLoader::GetMaterialMeshes(const std::string& modelName) const { + auto it = materialMeshes.find(modelName); + if (it != materialMeshes.end()) { + return it->second; + } + // Return a static empty vector to avoid creating temporary objects. + static const std::vector emptyVector; + return emptyVector; +} + +const Material* ModelLoader::GetMaterial(const std::string& materialName) const { + auto it = materials.find(materialName); + if (it != materials.end()) { + return it->second.get(); + } + return nullptr; +} + +const Material* ModelLoader::GetMaterialByIndex(uint32_t materialIndex) const { + if (materialIndex < materialsByIndex.size()) { + return materialsByIndex[materialIndex]; + } + return nullptr; +} + +const std::vector& ModelLoader::GetAnimations(const std::string& modelName) const { + auto it = models.find(modelName); + if (it != models.end() && it->second) { + return it->second->GetAnimations(); + } + // Return a static empty vector to avoid creating temporary objects. 
+ static const std::vector emptyVector; + return emptyVector; +} + +bool ModelLoader::ExtractPunctualLights(const tinygltf::Model& gltfModel, const std::string& modelName) { + std::cout << "Extracting punctual lights from model: " << modelName << std::endl; + + std::vector lights; + + // Check if the model has the KHR_lights_punctual extension + auto extensionIt = gltfModel.extensions.find("KHR_lights_punctual"); + if (extensionIt != gltfModel.extensions.end()) { + std::cout << " Found KHR_lights_punctual extension" << std::endl; + + // Parse the punctual lights from the extension + const tinygltf::Value& extension = extensionIt->second; + if (extension.Has("lights") && extension.Get("lights").IsArray()) { + const tinygltf::Value::Array& lightsArray = extension.Get("lights").Get(); + + for (size_t i = 0; i < lightsArray.size(); ++i) { + const tinygltf::Value& lightValue = lightsArray[i]; + if (!lightValue.IsObject()) + continue; + + ExtractedLight light; + + // Parse light type + if (lightValue.Has("type") && lightValue.Get("type").IsString()) { + std::string type = lightValue.Get("type").Get(); + if (type == "directional") { + light.type = ExtractedLight::Type::Directional; + } else if (type == "point") { + light.type = ExtractedLight::Type::Point; + } else if (type == "spot") { + light.type = ExtractedLight::Type::Spot; + } + } + + // Parse light color + if (lightValue.Has("color") && lightValue.Get("color").IsArray()) { + const tinygltf::Value::Array& colorArray = lightValue.Get("color").Get(); + if (colorArray.size() >= 3) { + light.color = glm::vec3( + colorArray[0].IsNumber() ? static_cast(colorArray[0].Get()) : 1.0f, + colorArray[1].IsNumber() ? static_cast(colorArray[1].Get()) : 1.0f, + colorArray[2].IsNumber() ? 
static_cast(colorArray[2].Get()) : 1.0f); + } + } + + // Parse light intensity + if (lightValue.Has("intensity") && lightValue.Get("intensity").IsNumber()) { + light.intensity = static_cast(lightValue.Get("intensity").Get()) * LIGHT_SCALE_FACTOR; + } + + // Parse light range (for point and spotlights) + if (lightValue.Has("range") && lightValue.Get("range").IsNumber()) { + light.range = static_cast(lightValue.Get("range").Get()); + } + + // Parse spotlights specific parameters + if (light.type == ExtractedLight::Type::Spot && lightValue.Has("spot")) { + const tinygltf::Value& spotValue = lightValue.Get("spot"); + if (spotValue.Has("innerConeAngle") && spotValue.Get("innerConeAngle").IsNumber()) { + light.innerConeAngle = static_cast(spotValue.Get("innerConeAngle").Get()); + } + if (spotValue.Has("outerConeAngle") && spotValue.Get("outerConeAngle").IsNumber()) { + light.outerConeAngle = static_cast(spotValue.Get("outerConeAngle").Get()); + } + } + + lights.push_back(light); + std::cout << " Parsed punctual light " << i << ": type=" << static_cast(light.type) + << ", intensity=" << light.intensity << std::endl; + } + } + } else { + std::cout << " No KHR_lights_punctual extension found" << std::endl; + } + + // Compute world transforms for all nodes in the default scene + std::vector nodeWorldTransforms(gltfModel.nodes.size(), glm::mat4(1.0f)); + + auto calcLocal = [](const tinygltf::Node& n) -> glm::mat4 { + // If matrix is provided, use it + if (n.matrix.size() == 16) { + glm::mat4 m(1.0f); + for (int r = 0; r < 4; ++r) { + for (int c = 0; c < 4; ++c) { + m[c][r] = static_cast(n.matrix[r * 4 + c]); + } + } + return m; + } + // Otherwise compose TRS + glm::mat4 T(1.0f), R(1.0f), S(1.0f); + if (n.translation.size() == 3) { + T = glm::translate(glm::mat4(1.0f), + glm::vec3( + static_cast(n.translation[0]), + static_cast(n.translation[1]), + static_cast(n.translation[2]))); + } + if (n.rotation.size() == 4) { + glm::quat q( + static_cast(n.rotation[3]), + 
static_cast(n.rotation[0]), + static_cast(n.rotation[1]), + static_cast(n.rotation[2])); + R = glm::mat4_cast(q); + } + if (n.scale.size() == 3) { + S = glm::scale(glm::mat4(1.0f), + glm::vec3( + static_cast(n.scale[0]), + static_cast(n.scale[1]), + static_cast(n.scale[2]))); + } + return T * R * S; + }; + + std::function < void(int, const glm::mat4 &) > traverseNode = [&](int nodeIndex, const glm::mat4& parent) { + if (nodeIndex < 0 || nodeIndex >= static_cast(gltfModel.nodes.size())) + return; + const tinygltf::Node& n = gltfModel.nodes[nodeIndex]; + glm::mat4 local = calcLocal(n); + glm::mat4 world = parent * local; + nodeWorldTransforms[nodeIndex] = world; + for (int child : n.children) { + traverseNode(child, world); + } + }; + + if (!gltfModel.scenes.empty()) { + int sceneIndex = gltfModel.defaultScene >= 0 ? gltfModel.defaultScene : 0; + if (sceneIndex < static_cast(gltfModel.scenes.size())) { + const tinygltf::Scene& scene = gltfModel.scenes[sceneIndex]; + for (int root : scene.nodes) { + traverseNode(root, glm::mat4(1.0f)); + } + } + } else { + // Fallback: traverse all nodes as roots + for (int i = 0; i < static_cast(gltfModel.nodes.size()); ++i) { + traverseNode(i, glm::mat4(1.0f)); + } + } + + // Now assign positions and directions using world transforms + for (size_t nodeIndex = 0; nodeIndex < gltfModel.nodes.size(); ++nodeIndex) { + const auto& node = gltfModel.nodes[nodeIndex]; + if (node.extensions.contains("KHR_lights_punctual")) { + const tinygltf::Value& nodeExtension = node.extensions.at("KHR_lights_punctual"); + if (nodeExtension.Has("light") && nodeExtension.Get("light").IsInt()) { + int lightIndex = nodeExtension.Get("light").Get(); + if (lightIndex >= 0 && lightIndex < static_cast(lights.size())) { + const glm::mat4& W = nodeWorldTransforms[nodeIndex]; + // Position from world transform origin + glm::vec3 pos = glm::vec3(W * glm::vec4(0, 0, 0, 1)); + lights[lightIndex].position = pos; + + // Direction for directional/spot: transform -Z + if 
(lights[lightIndex].type == ExtractedLight::Type::Directional || + lights[lightIndex].type == ExtractedLight::Type::Spot) { + glm::mat3 rot = glm::mat3(W); + glm::vec3 dir = glm::normalize(rot * glm::vec3(0.0f, 0.0f, -1.0f)); + lights[lightIndex].direction = dir; + } + + std::cout << " Light " << lightIndex << " positioned at (" + << lights[lightIndex].position.x << ", " + << lights[lightIndex].position.y << ", " + << lights[lightIndex].position.z << ")" << std::endl; + } + } + } + } + + // Store the extracted lights + extractedLights[modelName] = lights; + + std::cout << " Extracted " << lights.size() << " total lights from model" << std::endl; + return lights.empty(); +} diff --git a/attachments/sync2_engine/physics_system.cpp b/attachments/sync2_engine/physics_system.cpp new file mode 100644 index 00000000..c8ec5b9f --- /dev/null +++ b/attachments/sync2_engine/physics_system.cpp @@ -0,0 +1,1402 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "physics_system.h" +#include "entity.h" +#include "mesh_component.h" +#include "renderer.h" +#include "transform_component.h" +#include + +#include +#include +#include +#include +#include +#include + +// Physics constants +constexpr float TENNIS_BALL_RADIUS = 0.0335f; // meters + +// Concrete implementation of RigidBody +class ConcreteRigidBody final : public RigidBody { + public: + ConcreteRigidBody(Entity* entity, CollisionShape shape, float mass) : entity(entity), shape(shape), mass(mass) { + // Initialize with the entity's transform if available + if (entity) { + // Get the position, rotation, and scale from the entity's transform component + if (auto* transform = entity->GetComponent()) { + position = transform->GetPosition(); + rotation = glm::quat(transform->GetRotation()); // Convert from Euler angles to quaternion + scale = transform->GetScale(); + } else { + // Fallback to defaults if no transform component + position = glm::vec3(0.0f); + rotation = glm::quat(1.0f, 0.0f, 0.0f, 0.0f); // Identity quaternion + scale = glm::vec3(1.0f); + } + } + } + + ~ConcreteRigidBody() override = default; + + void SetPosition(const glm::vec3& _position) override { + position = _position; + + // Update entity transform component for visual representation + if (entity) { + if (auto* transform = entity->GetComponent()) { + transform->SetPosition(_position); + } + } + } + + void SetRotation(const glm::quat& _rotation) override { + rotation = _rotation; + + // Update entity transform component for visual representation + if (entity) { + if (auto* transform = entity->GetComponent()) { + // Convert quaternion to Euler angles for the transform component + glm::vec3 eulerAngles = glm::eulerAngles(_rotation); + transform->SetRotation(eulerAngles); + } + } + } + + void SetScale(const glm::vec3& _scale) override { + scale = _scale; + } + + void SetMass(float _mass) override { + mass = _mass; + } + + void SetRestitution(float _restitution) override { + restitution = 
_restitution; + } + + void SetFriction(float _friction) override { + friction = _friction; + } + + void ApplyForce(const glm::vec3& force, const glm::vec3& localPosition) override { + // In a real implementation, this would apply the force to the rigid body + linearVelocity += force / mass; + } + + void ApplyImpulse(const glm::vec3& impulse, const glm::vec3& localPosition) override { + // In a real implementation, this would apply the impulse to the rigid body + linearVelocity += impulse / mass; + } + + void SetLinearVelocity(const glm::vec3& velocity) override { + linearVelocity = velocity; + } + + void SetAngularVelocity(const glm::vec3& velocity) override { + angularVelocity = velocity; + } + + [[nodiscard]] glm::vec3 GetPosition() const override { + return position; + } + + [[nodiscard]] glm::quat GetRotation() const override { + return rotation; + } + + [[nodiscard]] glm::vec3 GetLinearVelocity() const override { + return linearVelocity; + } + + [[nodiscard]] glm::vec3 GetAngularVelocity() const override { + return angularVelocity; + } + + void SetKinematic(bool _kinematic) override { + // Prevent balls from being set as kinematic - they should always be dynamic + if (entity && entity->GetName().find("Ball_") == 0 && _kinematic) { + return; + } + + kinematic = _kinematic; + gpuNeedsUpdate = true; + } + + [[nodiscard]] bool IsKinematic() const override { + return kinematic; + } + + bool gpuNeedsUpdate = true; + + [[nodiscard]] Entity* GetEntity() const { + return entity; + } + + [[nodiscard]] CollisionShape GetShape() const { + return shape; + } + + [[nodiscard]] float GetMass() const { + return mass; + } + + [[nodiscard]] float GetInverseMass() const { + return mass > 0.0f ? 
1.0f / mass : 0.0f; + } + + [[nodiscard]] float GetRestitution() const { + return restitution; + } + + [[nodiscard]] float GetFriction() const { + return friction; + } + + private: + Entity* entity = nullptr; + CollisionShape shape; + + glm::vec3 position = glm::vec3(0.0f); + glm::quat rotation = glm::quat(1.0f, 0.0f, 0.0f, 0.0f); // Identity quaternion + glm::vec3 scale = glm::vec3(1.0f); + + glm::vec3 linearVelocity = glm::vec3(0.0f); + glm::vec3 angularVelocity = glm::vec3(0.0f); + + float mass = 1.0f; + float restitution = 0.5f; + float friction = 0.5f; + + bool kinematic = false; + bool markedForRemoval = false; // Flag to mark physics body for removal + + friend class PhysicsSystem; +}; + +PhysicsSystem::~PhysicsSystem() { + // Destructor implementation + if (initialized && gpuAccelerationEnabled) { + CleanupVulkanResources(); + } + // rigidBodies vector automatically cleared on destruction +} + +void PhysicsSystem::CreateMappedBuffer(vk::DeviceSize size, + vk::BufferUsageFlags usage, + vk::raii::Buffer& buffer, + vk::raii::DeviceMemory& memory, + const std::string& errorPrefix) { + const vk::raii::Device& raiiDevice = renderer->GetRaiiDevice(); + vk::BufferCreateInfo bufferInfo{ + .size = size, + .usage = usage, + .sharingMode = vk::SharingMode::eExclusive + }; + + try { + buffer = vk::raii::Buffer(raiiDevice, bufferInfo); + + vk::MemoryRequirements memRequirements = buffer.getMemoryRequirements(); + + vk::MemoryAllocateInfo allocInfo{ + .allocationSize = memRequirements.size, + .memoryTypeIndex = renderer->FindMemoryType( + memRequirements.memoryTypeBits, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent) + }; + + memory = vk::raii::DeviceMemory(raiiDevice, allocInfo); + buffer.bindMemory(*memory, 0); + } catch (const std::exception& e) { + throw std::runtime_error(errorPrefix + std::string(e.what())); + } +} + +bool PhysicsSystem::Initialize() { + // Enforce GPU-only physics. 
If GPU resources cannot be initialized, initialization fails. + + // Renderer must be set for GPU compute physics + if (!renderer) { + std::cerr << "PhysicsSystem::Initialize: Renderer is not set. GPU-only physics cannot proceed." << std::endl; + return false; + } + + // Always keep GPU acceleration enabled (CPU fallback is not allowed) + gpuAccelerationEnabled = true; + + // Initialize Vulkan resources; fail hard if not available + if (!InitializeVulkanResources()) { + std::cerr << "PhysicsSystem::Initialize: Failed to initialize Vulkan resources for physics (GPU-only)." << std::endl; + return false; + } + + initialized = true; + return true; +} + +void PhysicsSystem::Update(std::chrono::milliseconds deltaTime, uint64_t timelineValue, uint32_t frameIndex) { + auto startUpdate = std::chrono::steady_clock::now(); + // Drain any pending rigid body creations queued from background threads + std::vector toCreate; { + std::lock_guard lk(pendingMutex); + if (!pendingCreations.empty()) { + // Time-slice creations to avoid massive frame hangs on huge scene loads + // Reduced to 20 to keep the progress bar moving smoothly and UI responsive. 
+ const size_t maxCreationsPerFrame = 20; + size_t count = std::min(pendingCreations.size(), maxCreationsPerFrame); + toCreate.assign(pendingCreations.begin(), pendingCreations.begin() + count); + pendingCreations.erase(pendingCreations.begin(), pendingCreations.begin() + count); + } + } + uint32_t processed = 0; + for (const auto& pc : toCreate) { + if (++processed % 5 == 0 && renderer) { + renderer->KickWatchdog(); + } + if (!pc.entity) + continue; + + // Check size limit with proper locking (CreateRigidBody will acquire the lock again, but that's safe) + { + std::lock_guard lock(rigidBodiesMutex); + if (rigidBodies.size() >= maxGPUObjects) + break; // avoid oversubscription + } + + RigidBody* rb = CreateRigidBody(pc.entity, pc.shape, pc.mass); + if (rb) { + rb->SetKinematic(pc.kinematic); + rb->SetRestitution(pc.restitution); + rb->SetFriction(pc.friction); + } + } + + // GPU-ONLY physics - NO CPU fallback available + + // Check if GPU physics is properly initialized and available + bool canUseGPUPhysics = false; { + std::lock_guard lock(rigidBodiesMutex); + canUseGPUPhysics = (rigidBodies.size() <= maxGPUObjects); + } + + if (initialized&& gpuAccelerationEnabled && renderer && canUseGPUPhysics) { + SimulatePhysicsOnGPU(deltaTime, timelineValue, frameIndex); + } else { + // NO CPU FALLBACK - GPU physics must work, or physics is disabled. + // IMPORTANT: Even if physics is disabled, we MUST signal the timeline milestone + // so the renderer (which waits for physics) does not deadlock. + if (renderer && timelineValue > 0) { + // Only signal if the value strictly advances the frame timeline; otherwise skip to avoid + // out-of-order (smaller-than-current) signals that violate VVL timeline rules. 
+ if (timelineValue > renderer->GetCurrentTimelineValue()) { + renderer->SignalFrameTimeline(timelineValue); + } + } + static bool noFallbackLogged = false; + if (!noFallbackLogged) { + noFallbackLogged = true; + } + } + + // Clean up rigid bodies marked for removal (happens regardless of GPU/CPU physics path) + CleanupMarkedBodies(); +} + +void PhysicsSystem::EnqueueRigidBodyCreation(Entity* entity, + CollisionShape shape, + float mass, + bool kinematic, + float restitution, + float friction) { + if (!entity) + return; + std::lock_guard lk(pendingMutex); + pendingCreations.push_back(PendingCreation{entity, shape, mass, kinematic, restitution, friction}); +} + +RigidBody* PhysicsSystem::CreateRigidBody(Entity* entity, CollisionShape shape, float mass) { + // Create a new rigid body + auto rigidBody = std::make_unique(entity, shape, mass); + + // Store the rigid body with thread-safe access + std::lock_guard lock(rigidBodiesMutex); + rigidBodies.push_back(std::move(rigidBody)); + needsBroadPhase = true; + + return rigidBodies.back().get(); +} + +bool PhysicsSystem::DestroyRigidBody(RigidBody* rigidBody) { + std::lock_guard lock(rigidBodiesMutex); + + // Find the rigid body in the vector + auto it = std::ranges::find_if(rigidBodies, + [rigidBody](const std::unique_ptr& rb) { + return rb.get() == rigidBody; + }); + + if (it != rigidBodies.end()) { + // Remove the rigid body + rigidBodies.erase(it); + + return true; + } + + std::cerr << "PhysicsSystem::DestroyRigidBody: Rigid body not found" << std::endl; + return false; +} + +void PhysicsSystem::SetGravity(const glm::vec3& _gravity) { + gravity = _gravity; +} + +glm::vec3 PhysicsSystem::GetGravity() const { + return gravity; +} + +bool PhysicsSystem::Raycast(const glm::vec3& origin, + const glm::vec3& direction, + float maxDistance, + glm::vec3* hitPosition, + glm::vec3* hitNormal, + Entity** hitEntity) const { + // Normalize the direction vector + glm::vec3 normalizedDirection = glm::normalize(direction); + + // 
Variables to track the closest hit + float closestHitDistance = maxDistance; + bool hitFound = false; + glm::vec3 closestHitPosition; + glm::vec3 closestHitNormal; + Entity* closestHitEntity = nullptr; + + // Protect access to rigidBodies vector during iteration + std::lock_guard lock(rigidBodiesMutex); + + // Check each rigid body for intersection + for (const auto& rigidBody : rigidBodies) { + auto concreteRigidBody = dynamic_cast(rigidBody.get()); + Entity* entity = concreteRigidBody->GetEntity(); + + // Skip if the entity is null + if (!entity) { + continue; + } + + // Get the position and shape of the rigid body + glm::vec3 position = concreteRigidBody->GetPosition(); + CollisionShape shape = concreteRigidBody->GetShape(); + + // Variables for hit detection + float hitDistance = 0.0f; + glm::vec3 localHitPosition; + glm::vec3 localHitNormal; + bool hit = false; + + // Check for intersection based on the shape + switch (shape) { + case CollisionShape::Sphere: { + // Sphere intersection test + float radius = TENNIS_BALL_RADIUS; + + // Calculate coefficients for quadratic equation + glm::vec3 oc = origin - position; + float a = glm::dot(normalizedDirection, normalizedDirection); + float b = 2.0f * glm::dot(oc, normalizedDirection); + float c = glm::dot(oc, oc) - radius * radius; + float discriminant = b * b - 4 * a * c; + + if (discriminant >= 0) { + // Calculate intersection distance + float t = (-b - std::sqrt(discriminant)) / (2.0f * a); + + // Check if the intersection is within range + if (t > 0 && t < closestHitDistance) { + hitDistance = t; + localHitPosition = origin + normalizedDirection * t; + localHitNormal = glm::normalize(localHitPosition - position); + hit = true; + } + } + break; + } + case CollisionShape::Box: { + // Box intersection test (AABB) + glm::vec3 halfExtents(0.5f, 0.5f, 0.5f); // Default box size + + // Calculate min and max bounds of the box + glm::vec3 boxMin = position - halfExtents; + glm::vec3 boxMax = position + halfExtents; + + 
// Calculate intersection with each slab + float tmin = -INFINITY, tmax = INFINITY; + + for (int i = 0; i < 3; i++) { + if (std::abs(normalizedDirection[i]) < 0.0001f) { + // Ray is parallel to the slab, check if origin is within slab + if (origin[i] < boxMin[i] || origin[i] > boxMax[i]) { + // No intersection + hit = false; + break; + } + } else { + // Calculate intersection distances + float ood = 1.0f / normalizedDirection[i]; + float t1 = (boxMin[i] - origin[i]) * ood; + float t2 = (boxMax[i] - origin[i]) * ood; + + // Ensure t1 <= t2 + if (t1 > t2) { + std::swap(t1, t2); + } + + // Update tmin and tmax + tmin = std::max(tmin, t1); + tmax = std::min(tmax, t2); + + if (tmin > tmax) { + // No intersection + hit = false; + break; + } + } + } + + // Check if the intersection is within range + if (tmin > 0 && tmin < closestHitDistance) { + hitDistance = tmin; + localHitPosition = origin + normalizedDirection * tmin; + + // Calculate normal based on which face was hit + glm::vec3 center = position; + glm::vec3 d = localHitPosition - center; + float bias = 1.00001f; // Small bias to ensure we get the correct face + + localHitNormal = glm::vec3(0.0f); + if (d.x > halfExtents.x * bias) + localHitNormal = glm::vec3(1, 0, 0); + else if (d.x < -halfExtents.x * bias) + localHitNormal = glm::vec3(-1, 0, 0); + else if (d.y > halfExtents.y * bias) + localHitNormal = glm::vec3(0, 1, 0); + else if (d.y < -halfExtents.y * bias) + localHitNormal = glm::vec3(0, -1, 0); + else if (d.z > halfExtents.z * bias) + localHitNormal = glm::vec3(0, 0, 1); + else if (d.z < -halfExtents.z * bias) + localHitNormal = glm::vec3(0, 0, -1); + + hit = true; + } + break; + } + case CollisionShape::Capsule: { + // Capsule intersection test + // Simplified as a line segment with spheres at each end + float radius = 0.5f; // Default radius + float halfHeight = 0.5f; // Default half-height + + // Define capsule line segment + glm::vec3 capsuleA = position + glm::vec3(0, -halfHeight, 0); + glm::vec3 
capsuleB = position + glm::vec3(0, halfHeight, 0); + + // Calculate the closest point on a line segment + glm::vec3 ab = capsuleB - capsuleA; + glm::vec3 ao = origin - capsuleA; + + float t = glm::dot(ao, ab) / glm::dot(ab, ab); + t = glm::clamp(t, 0.0f, 1.0f); + + glm::vec3 closestPoint = capsuleA + ab * t; + + // Sphere intersection test with the closest point + glm::vec3 oc = origin - closestPoint; + float a = glm::dot(normalizedDirection, normalizedDirection); + float b = 2.0f * glm::dot(oc, normalizedDirection); + float c = glm::dot(oc, oc) - radius * radius; + + if (float discriminant = b * b - 4 * a * c; discriminant >= 0) { + // Calculate intersection distance + + // Check if the intersection is within range + if (float id = (-b - std::sqrt(discriminant)) / (2.0f * a); id > 0 && id < closestHitDistance) { + hitDistance = id; + localHitPosition = origin + normalizedDirection * id; + localHitNormal = glm::normalize(localHitPosition - closestPoint); + hit = true; + } + } + break; + } + case CollisionShape::Mesh: { + // Proper mesh intersection test using triangle data + if (auto* meshComponent = entity->GetComponent()) { + const auto& vertices = meshComponent->GetVertices(); + const auto& indices = meshComponent->GetIndices(); + + // Test intersection with each triangle in the mesh + for (size_t i = 0; i < indices.size(); i += 3) { + if (i + 2 >= indices.size()) + break; + + // Get triangle vertices + glm::vec3 v0 = vertices[indices[i]].position; + glm::vec3 v1 = vertices[indices[i + 1]].position; + glm::vec3 v2 = vertices[indices[i + 2]].position; + + // Transform vertices to world space + if (auto* transform = entity->GetComponent()) { + glm::mat4 transformMatrix = transform->GetModelMatrix(); + v0 = glm::vec3(transformMatrix * glm::vec4(v0, 1.0f)); + v1 = glm::vec3(transformMatrix * glm::vec4(v1, 1.0f)); + v2 = glm::vec3(transformMatrix * glm::vec4(v2, 1.0f)); + } + + // Ray-triangle intersection using Möller-Trumbore algorithm + glm::vec3 edge1 = v1 - v0; 
+ glm::vec3 edge2 = v2 - v0; + glm::vec3 h = glm::cross(normalizedDirection, edge2); + float a = glm::dot(edge1, h); + + if (a > -0.00001f && a < 0.00001f) + continue; // Ray parallel to triangle + + float f = 1.0f / a; + glm::vec3 s = origin - v0; + float u = f * glm::dot(s, h); + + if (u < 0.0f || u > 1.0f) + continue; + + glm::vec3 q = glm::cross(s, edge1); + float v = f * glm::dot(normalizedDirection, q); + + if (v < 0.0f || u + v > 1.0f) + continue; + + float t = f * glm::dot(edge2, q); + + if (t > 0.00001f && t < closestHitDistance) { + hitDistance = t; + localHitPosition = origin + normalizedDirection * t; + localHitNormal = glm::normalize(glm::cross(edge1, edge2)); + hit = true; + closestHitDistance = t; // Update for closer triangles + } + } + } + break; + } + default: + break; + } + + // Update the closest hit if a hit was found + if (hit && hitDistance < closestHitDistance) { + closestHitDistance = hitDistance; + closestHitPosition = localHitPosition; + closestHitNormal = localHitNormal; + closestHitEntity = entity; + hitFound = true; + } + } + + // Set output parameters if a hit was found + if (hitFound) { + if (hitPosition) { + *hitPosition = closestHitPosition; + } + + if (hitNormal) { + *hitNormal = closestHitNormal; + } + + if (hitEntity) { + *hitEntity = closestHitEntity; + } + } + + return hitFound; +} + +// Helper function to read a shader file +static std::vector readFile(const std::string& filename) { + std::vector searchPaths = { + filename, + "cmake-build-debug/" + filename, + "cmake-build-release/" + filename, + "build/" + filename, + "../simple_engine/" + filename, + "../simple_engine/cmake-build-debug/" + filename, + "../" + filename, + "../../" + filename + }; + + for (const auto& path : searchPaths) { + std::ifstream file(path, std::ios::ate | std::ios::binary); + if (file.is_open()) { + size_t fileSize = file.tellg(); + std::vector buffer(fileSize); + file.seekg(0); + file.read(buffer.data(), static_cast(fileSize)); + file.close(); + 
return buffer; + } + } + + std::cerr << "CRITICAL: physics_system::Failed to open " << filename << " in any of the following paths:" << std::endl; + for (const auto& path : searchPaths) { + std::cerr << " - " << path << std::endl; + } + + throw std::runtime_error("Failed to open file in any search path: " + filename); +} + +// Helper function to create a shader module +static vk::raii::ShaderModule createShaderModule(const vk::raii::Device& device, const std::vector& code) { + vk::ShaderModuleCreateInfo createInfo; + createInfo.codeSize = code.size(); + createInfo.pCode = reinterpret_cast(code.data()); + + return {device, createInfo}; +} + +bool PhysicsSystem::InitializeVulkanResources() { + if (!renderer) { + std::cerr << "Renderer is not set" << std::endl; + return false; + } + + vk::Device device = renderer->GetDevice(); + if (!device) { + std::cerr << "Vulkan device is not valid" << std::endl; + return false; + } + + try { + // Create shader modules + const vk::raii::Device& raiiDevice = renderer->GetRaiiDevice(); + + // Load physics shader once and reuse for all compute pipelines + std::vector physicsShaderCode = readFile("shaders/physics.spv"); + vulkanResources.integrateShaderModule = createShaderModule(raiiDevice, physicsShaderCode); + vulkanResources.broadPhaseShaderModule = createShaderModule(raiiDevice, physicsShaderCode); + vulkanResources.narrowPhaseShaderModule = createShaderModule(raiiDevice, physicsShaderCode); + vulkanResources.resolveShaderModule = createShaderModule(raiiDevice, physicsShaderCode); + + // Create a descriptor set layout + std::array bindings = { + // Physics data buffer + vk::DescriptorSetLayoutBinding( + 0, + // binding + vk::DescriptorType::eStorageBuffer, + // descriptorType + 1, + // descriptorCount + vk::ShaderStageFlagBits::eCompute, + // stageFlags + nullptr // pImmutableSamplers + ), + // Collision data buffer + vk::DescriptorSetLayoutBinding( + 1, + // binding + vk::DescriptorType::eStorageBuffer, + // descriptorType + 1, 
+ // descriptorCount + vk::ShaderStageFlagBits::eCompute, + // stageFlags + nullptr // pImmutableSamplers + ), + // Pair buffer + vk::DescriptorSetLayoutBinding( + 2, + // binding + vk::DescriptorType::eStorageBuffer, + // descriptorType + 1, + // descriptorCount + vk::ShaderStageFlagBits::eCompute, + // stageFlags + nullptr // pImmutableSamplers + ), + // Counter buffer + vk::DescriptorSetLayoutBinding( + 3, + // binding + vk::DescriptorType::eStorageBuffer, + // descriptorType + 1, + // descriptorCount + vk::ShaderStageFlagBits::eCompute, + // stageFlags + nullptr // pImmutableSamplers + ), + // Parameters buffer + vk::DescriptorSetLayoutBinding( + 4, + // binding + vk::DescriptorType::eUniformBuffer, + // descriptorType + 1, + // descriptorCount + vk::ShaderStageFlagBits::eCompute, + // stageFlags + nullptr // pImmutableSamplers + ) + }; + + vk::DescriptorSetLayoutCreateInfo layoutInfo; + layoutInfo.bindingCount = static_cast(bindings.size()); + layoutInfo.pBindings = bindings.data(); + vulkanResources.descriptorSetLayout = vk::raii::DescriptorSetLayout(raiiDevice, layoutInfo); + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo; + pipelineLayoutInfo.setLayoutCount = 1; + vk::DescriptorSetLayout descriptorSetLayout = *vulkanResources.descriptorSetLayout; + pipelineLayoutInfo.pSetLayouts = &descriptorSetLayout; + vulkanResources.pipelineLayout = vk::raii::PipelineLayout(raiiDevice, pipelineLayoutInfo); + + // Create compute pipelines + vk::ComputePipelineCreateInfo pipelineInfo; + pipelineInfo.layout = *vulkanResources.pipelineLayout; + pipelineInfo.basePipelineHandle = nullptr; + + // Integrate pipeline + vk::PipelineShaderStageCreateInfo integrateStageInfo; + integrateStageInfo.stage = vk::ShaderStageFlagBits::eCompute; + integrateStageInfo.module = *vulkanResources.integrateShaderModule; + integrateStageInfo.pName = "IntegrateCS"; + pipelineInfo.stage = integrateStageInfo; + vulkanResources.integratePipeline = 
vk::raii::Pipeline(raiiDevice, nullptr, pipelineInfo); + + // Broad phase pipeline + vk::PipelineShaderStageCreateInfo broadPhaseStageInfo; + broadPhaseStageInfo.stage = vk::ShaderStageFlagBits::eCompute; + broadPhaseStageInfo.module = *vulkanResources.broadPhaseShaderModule; + broadPhaseStageInfo.pName = "BroadPhaseCS"; + pipelineInfo.stage = broadPhaseStageInfo; + vulkanResources.broadPhasePipeline = vk::raii::Pipeline(raiiDevice, nullptr, pipelineInfo); + + // Narrow phase pipeline + vk::PipelineShaderStageCreateInfo narrowPhaseStageInfo; + narrowPhaseStageInfo.stage = vk::ShaderStageFlagBits::eCompute; + narrowPhaseStageInfo.module = *vulkanResources.narrowPhaseShaderModule; + narrowPhaseStageInfo.pName = "NarrowPhaseCS"; + pipelineInfo.stage = narrowPhaseStageInfo; + vulkanResources.narrowPhasePipeline = vk::raii::Pipeline(raiiDevice, nullptr, pipelineInfo); + + // Resolve pipeline + vk::PipelineShaderStageCreateInfo resolveStageInfo; + resolveStageInfo.stage = vk::ShaderStageFlagBits::eCompute; + resolveStageInfo.module = *vulkanResources.resolveShaderModule; + resolveStageInfo.pName = "ResolveCS"; + pipelineInfo.stage = resolveStageInfo; + vulkanResources.resolvePipeline = vk::raii::Pipeline(raiiDevice, nullptr, pipelineInfo); + + // Create buffers + vk::DeviceSize physicsBufferSize = sizeof(GPUPhysicsData) * maxGPUObjects; + vk::DeviceSize collisionBufferSize = sizeof(GPUCollisionData) * maxGPUCollisions; + vk::DeviceSize pairBufferSize = sizeof(uint32_t) * 2 * maxGPUCollisions; + vk::DeviceSize counterBufferSize = sizeof(uint32_t) * 2; + vk::DeviceSize paramsBufferSize = ((sizeof(PhysicsParams) + 63) / 64) * 64; + + // Create a physics buffer + CreateMappedBuffer(physicsBufferSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vulkanResources.physicsBuffer, + vulkanResources.physicsBufferMemory, + "Failed to create physics buffer: "); + + // Create a collision buffer + CreateMappedBuffer(collisionBufferSize, + vk::BufferUsageFlagBits::eStorageBuffer, + 
vulkanResources.collisionBuffer, + vulkanResources.collisionBufferMemory, + "Failed to create collision buffer: "); + + // Create a pair buffer + CreateMappedBuffer(pairBufferSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vulkanResources.pairBuffer, + vulkanResources.pairBufferMemory, + "Failed to create pair buffer: "); + + // Create the counter-buffer + CreateMappedBuffer(counterBufferSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vulkanResources.counterBuffer, + vulkanResources.counterBufferMemory, + "Failed to create counter buffer: "); + + // Create a params buffer + CreateMappedBuffer(paramsBufferSize, + vk::BufferUsageFlagBits::eUniformBuffer, + vulkanResources.paramsBuffer, + vulkanResources.paramsBufferMemory, + "Failed to create params buffer: "); + + // Create persistent mapped memory pointers for improved performance + try { + // Map entire memory objects persistently to satisfy VK_WHOLE_SIZE flush alignment requirements + vulkanResources.persistentPhysicsMemory = vulkanResources.physicsBufferMemory.mapMemory(0, VK_WHOLE_SIZE); + vulkanResources.persistentCounterMemory = vulkanResources.counterBufferMemory.mapMemory(0, VK_WHOLE_SIZE); + vulkanResources.persistentParamsMemory = vulkanResources.paramsBufferMemory.mapMemory(0, VK_WHOLE_SIZE); + } catch (const std::exception& e) { + throw std::runtime_error("Failed to create persistent mapped memory: " + std::string(e.what())); + } + + // Initialize counter-buffer using persistent memory + uint32_t initialCounters[2] = {0, 0}; // [0] = pair count, [1] = collision count + memcpy(vulkanResources.persistentCounterMemory, initialCounters, sizeof(initialCounters)); + + // Create a descriptor pool with capacity for 4 physics stages + std::array poolSizes = { + vk::DescriptorPoolSize(vk::DescriptorType::eStorageBuffer, 16), // 4 storage buffers × 4 stages + vk::DescriptorPoolSize(vk::DescriptorType::eUniformBuffer, 4) // 1 uniform buffer × 4 stages + }; + + vk::DescriptorPoolCreateInfo poolInfo; + 
poolInfo.flags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet; + poolInfo.poolSizeCount = static_cast(poolSizes.size()); + poolInfo.pPoolSizes = poolSizes.data(); + poolInfo.maxSets = 4; // Support 4 descriptor sets for 4 physics stages + vulkanResources.descriptorPool = vk::raii::DescriptorPool(raiiDevice, poolInfo); + + // Allocate descriptor sets + vk::DescriptorSetAllocateInfo descriptorSetAllocInfo; + descriptorSetAllocInfo.descriptorPool = *vulkanResources.descriptorPool; + descriptorSetAllocInfo.descriptorSetCount = 1; + vk::DescriptorSetLayout descriptorSetLayoutRef = *vulkanResources.descriptorSetLayout; + descriptorSetAllocInfo.pSetLayouts = &descriptorSetLayoutRef; + + try { + vulkanResources.descriptorSets = raiiDevice.allocateDescriptorSets(descriptorSetAllocInfo); + } catch (const std::exception& e) { + throw std::runtime_error("Failed to allocate descriptor sets: " + std::string(e.what())); + } + + // Update descriptor sets + vk::DescriptorBufferInfo physicsBufferInfo; + physicsBufferInfo.buffer = *vulkanResources.physicsBuffer; + physicsBufferInfo.offset = 0; + physicsBufferInfo.range = physicsBufferSize; + + vk::DescriptorBufferInfo collisionBufferInfo; + collisionBufferInfo.buffer = *vulkanResources.collisionBuffer; + collisionBufferInfo.offset = 0; + collisionBufferInfo.range = collisionBufferSize; + + vk::DescriptorBufferInfo pairBufferInfo; + pairBufferInfo.buffer = *vulkanResources.pairBuffer; + pairBufferInfo.offset = 0; + pairBufferInfo.range = pairBufferSize; + + vk::DescriptorBufferInfo counterBufferInfo; + counterBufferInfo.buffer = *vulkanResources.counterBuffer; + counterBufferInfo.offset = 0; + counterBufferInfo.range = counterBufferSize; + + vk::DescriptorBufferInfo paramsBufferInfo; + paramsBufferInfo.buffer = *vulkanResources.paramsBuffer; + paramsBufferInfo.offset = 0; + paramsBufferInfo.range = VK_WHOLE_SIZE; // Use VK_WHOLE_SIZE to ensure the entire buffer is accessible + + std::array descriptorWrites; + + // Physics 
buffer + descriptorWrites[0].setDstSet(*vulkanResources.descriptorSets[0]).setDstBinding(0).setDstArrayElement(0).setDescriptorCount(1).setDescriptorType(vk::DescriptorType::eStorageBuffer).setPBufferInfo(&physicsBufferInfo); + + // Collision buffer + descriptorWrites[1].setDstSet(*vulkanResources.descriptorSets[0]).setDstBinding(1).setDstArrayElement(0).setDescriptorCount(1).setDescriptorType(vk::DescriptorType::eStorageBuffer).setPBufferInfo(&collisionBufferInfo); + + // Pair buffer + descriptorWrites[2].setDstSet(*vulkanResources.descriptorSets[0]).setDstBinding(2).setDstArrayElement(0).setDescriptorCount(1).setDescriptorType(vk::DescriptorType::eStorageBuffer).setPBufferInfo(&pairBufferInfo); + + // Counter buffer + descriptorWrites[3].setDstSet(*vulkanResources.descriptorSets[0]).setDstBinding(3).setDstArrayElement(0).setDescriptorCount(1).setDescriptorType(vk::DescriptorType::eStorageBuffer).setPBufferInfo(&counterBufferInfo); + + // Params buffer + descriptorWrites[4].setDstSet(*vulkanResources.descriptorSets[0]).setDstBinding(4).setDstArrayElement(0).setDescriptorCount(1).setDescriptorType(vk::DescriptorType::eUniformBuffer).setPBufferInfo(¶msBufferInfo); + + raiiDevice.updateDescriptorSets(descriptorWrites, nullptr); + + // Create a command pool bound to the compute queue family used by the renderer + vk::CommandPoolCreateInfo commandPoolInfo; + commandPoolInfo.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer; + commandPoolInfo.queueFamilyIndex = renderer->GetComputeQueueFamilyIndex(); + vulkanResources.commandPool = vk::raii::CommandPool(raiiDevice, commandPoolInfo); + + // Allocate command buffers + vk::CommandBufferAllocateInfo commandBufferInfo; + commandBufferInfo.commandPool = *vulkanResources.commandPool; + commandBufferInfo.level = vk::CommandBufferLevel::ePrimary; + commandBufferInfo.commandBufferCount = static_cast(renderer->GetMaxFramesInFlight()); + + try { + vulkanResources.commandBuffers = 
raiiDevice.allocateCommandBuffers(commandBufferInfo); + } catch (const std::exception& e) { + throw std::runtime_error("Failed to allocate command buffers: " + std::string(e.what())); + } + + // Create a dedicated fence for compute synchronization + vk::FenceCreateInfo fenceInfo{}; + vulkanResources.computeFence = vk::raii::Fence(raiiDevice, fenceInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Error initializing Vulkan resources: " << e.what() << std::endl; + CleanupVulkanResources(); + return false; + } +} + +void PhysicsSystem::CleanupVulkanResources() { + if (!renderer) { + return; + } + + // Wait for the device to be idle before cleaning up + renderer->WaitIdle(); + + // Cleanup in proper order to avoid validation errors + // 1. Clear descriptor sets BEFORE destroying the descriptor pool + vulkanResources.descriptorSets.clear(); + + // 2. Destroy pipelines before pipeline layout + vulkanResources.resolvePipeline = nullptr; + vulkanResources.narrowPhasePipeline = nullptr; + vulkanResources.broadPhasePipeline = nullptr; + vulkanResources.integratePipeline = nullptr; + + // 3. Destroy pipeline layout before descriptor set layout + vulkanResources.pipelineLayout = nullptr; + vulkanResources.descriptorSetLayout = nullptr; + + // 4. Destroy shader modules + vulkanResources.resolveShaderModule = nullptr; + vulkanResources.narrowPhaseShaderModule = nullptr; + vulkanResources.broadPhaseShaderModule = nullptr; + vulkanResources.integrateShaderModule = nullptr; + + // 5. Destroy the descriptor pool after descriptor sets are cleared + vulkanResources.descriptorPool = nullptr; + + // 6. Destroy the command buffers before the command pool + vulkanResources.commandBuffers.clear(); + vulkanResources.commandPool = nullptr; + + // 7. Destroy compute fence + vulkanResources.computeFence = nullptr; + + // 8. 
Unmap persistent memory pointers before destroying buffer memory + if (vulkanResources.persistentPhysicsMemory && *vulkanResources.physicsBufferMemory) { + vulkanResources.physicsBufferMemory.unmapMemory(); + vulkanResources.persistentPhysicsMemory = nullptr; + } + + if (vulkanResources.persistentCounterMemory && *vulkanResources.counterBufferMemory) { + vulkanResources.counterBufferMemory.unmapMemory(); + vulkanResources.persistentCounterMemory = nullptr; + } + + if (vulkanResources.persistentParamsMemory && *vulkanResources.paramsBufferMemory) { + vulkanResources.paramsBufferMemory.unmapMemory(); + vulkanResources.persistentParamsMemory = nullptr; + } + + // 8. Destroy buffers and their memory + vulkanResources.paramsBuffer = nullptr; + vulkanResources.paramsBufferMemory = nullptr; + vulkanResources.counterBuffer = nullptr; + vulkanResources.counterBufferMemory = nullptr; + vulkanResources.pairBuffer = nullptr; + vulkanResources.pairBufferMemory = nullptr; + vulkanResources.collisionBuffer = nullptr; + vulkanResources.collisionBufferMemory = nullptr; + vulkanResources.physicsBuffer = nullptr; + vulkanResources.physicsBufferMemory = nullptr; +} + +void PhysicsSystem::UpdateGPUPhysicsData(std::chrono::milliseconds deltaTime) const { + if (!renderer) { + return; + } + + // Validate Vulkan resources and persistent memory pointers before using them + if (!*vulkanResources.physicsBuffer || !*vulkanResources.physicsBufferMemory || + !*vulkanResources.counterBuffer || !*vulkanResources.counterBufferMemory || + !*vulkanResources.paramsBuffer || !*vulkanResources.paramsBufferMemory || + !vulkanResources.persistentPhysicsMemory || !vulkanResources.persistentCounterMemory || !vulkanResources.persistentParamsMemory) { + std::cerr << "PhysicsSystem::UpdateGPUPhysicsData: Invalid Vulkan resources or persistent memory pointers" << std::endl; + return; + } + + // Skip physics buffer operations if no rigid bodies exist + uint32_t dynamicCount = 0; + if (!rigidBodies.empty()) { + 
// Use persistent mapped memory for physics buffer + auto* gpuData = static_cast(vulkanResources.persistentPhysicsMemory); + const size_t count = std::min(rigidBodies.size(), static_cast(maxGPUObjects)); + for (size_t i = 0; i < count; i++) { + auto* concreteRigidBody = dynamic_cast(rigidBodies[i].get()); + if (!concreteRigidBody) { + continue; + } + + const bool isKinematic = concreteRigidBody->IsKinematic(); + if (!isKinematic) { + dynamicCount++; + } + + // Skip GPU updates for static (kinematic) bodies that haven't changed. + // Dynamic bodies (like tennis balls) always need their state synchronized. + if (isKinematic && !concreteRigidBody->gpuNeedsUpdate) { + continue; + } + concreteRigidBody->gpuNeedsUpdate = false; + + gpuData[i].position = glm::vec4(concreteRigidBody->GetPosition(), concreteRigidBody->GetInverseMass()); + gpuData[i].rotation = glm::vec4(concreteRigidBody->GetRotation().x, + concreteRigidBody->GetRotation().y, + concreteRigidBody->GetRotation().z, + concreteRigidBody->GetRotation().w); + gpuData[i].linearVelocity = glm::vec4(concreteRigidBody->GetLinearVelocity(), concreteRigidBody->GetRestitution()); + gpuData[i].angularVelocity = glm::vec4(concreteRigidBody->GetAngularVelocity(), concreteRigidBody->GetFriction()); + // CRITICAL FIX: Initialize forces properly instead of always resetting to zero + // For balls, we want to start with zero force and let the shader apply gravity + // For static geometry, forces should remain zero + auto initialForce = glm::vec3(0.0f); + auto initialTorque = glm::vec3(0.0f); + + // The shader will add gravity and other forces each frame + gpuData[i].force = glm::vec4(initialForce, isKinematic ? 1.0f : 0.0f); + // Use gravity only for dynamic bodies + gpuData[i].torque = glm::vec4(initialTorque, isKinematic ? 
0.0f : 1.0f); + + // Set collider data based on a collider type + switch (concreteRigidBody->GetShape()) { + case CollisionShape::Sphere: + // Use tennis ball radius instead of hardcoded 0.5f + gpuData[i].colliderData = glm::vec4(TENNIS_BALL_RADIUS, 0.0f, 0.0f, static_cast(0)); // 0 = Sphere + gpuData[i].colliderData2 = glm::vec4(0.0f); + break; + case CollisionShape::Box: + gpuData[i].colliderData = glm::vec4(0.5f, 0.5f, 0.5f, static_cast(1)); // 1 = Box + gpuData[i].colliderData2 = glm::vec4(0.0f); + break; + case CollisionShape::Mesh: { + // Compute an axis-aligned bounding box from the entity's mesh in WORLD space + // and pass half-extents and local offset to the GPU. This enables sphere-geometry + // collisions against actual imported GLTF geometry rather than a constant box. + glm::vec3 halfExtents(5.0f); + glm::vec3 localOffset(0.0f); + + auto* entity = concreteRigidBody->GetEntity(); + if (entity) { + auto* meshComp = entity->GetComponent(); + auto* xform = entity->GetComponent(); + if (meshComp && xform && meshComp->HasLocalAABB()) { + glm::vec3 localMin = meshComp->GetLocalAABBMin(); + glm::vec3 localMax = meshComp->GetLocalAABBMax(); + glm::vec3 localCenter = 0.5f * (localMin + localMax); + glm::vec3 localHalfExtents = 0.5f * (localMax - localMin); + + glm::mat4 model = (meshComp->GetInstanceCount() > 0) ? 
meshComp->GetInstance(0).getModelMatrix() : xform->GetModelMatrix(); + glm::vec3 centerWS = glm::vec3(model * glm::vec4(localCenter, 1.0f)); + + glm::mat3 RS = glm::mat3(model); + glm::mat3 absRS; + absRS[0] = glm::abs(RS[0]); + absRS[1] = glm::abs(RS[1]); + absRS[2] = glm::abs(RS[2]); + + glm::vec3 worldHalfExtents = absRS * localHalfExtents; + halfExtents = glm::max(worldHalfExtents, glm::vec3(0.01f)); + + // Offset relative to rigid body position + localOffset = centerWS - concreteRigidBody->GetPosition(); + } + } + + // Encode Mesh collider as Mesh (type=2) for GPU narrowphase handling (sphere vs mesh) + gpuData[i].colliderData = glm::vec4(halfExtents, static_cast(2)); // 2 = Mesh (represented as world AABB) + gpuData[i].colliderData2 = glm::vec4(localOffset, 0.0f); + } + break; + default: + gpuData[i].colliderData = glm::vec4(0.0f, 0.0f, 0.0f, -1.0f); // Invalid + gpuData[i].colliderData2 = glm::vec4(0.0f); + break; + } + } + } + hasDynamicBodies = (dynamicCount > 0); + + // Reset counters using persistent mapped memory + uint32_t initialCounters[2] = {0, 0}; // [0] = pair count, [1] = collision count + memcpy(vulkanResources.persistentCounterMemory, initialCounters, sizeof(initialCounters)); + + // Update params buffer + PhysicsParams params{}; + params.deltaTime = deltaTime.count() * 0.001f; // Use actual deltaTime instead of fixed timestep + params.numBodies = static_cast(std::min(rigidBodies.size(), static_cast(maxGPUObjects))); + params.maxCollisions = maxGPUCollisions; + params.padding = 0.0f; // Initialize padding to zero for proper std140 alignment + params.gravity = glm::vec4(gravity, 0.0f); // Pack gravity into vec4 with padding + + // Update params buffer using persistent mapped memory + memcpy(vulkanResources.persistentParamsMemory, ¶ms, sizeof(PhysicsParams)); + + // CRITICAL FIX: Explicit memory flush to ensure HOST_COHERENT memory is fully visible to GPU + // Even with HOST_COHERENT flag, some systems may have cache coherency issues with partial 
writes + // Use VK_WHOLE_SIZE to avoid nonCoherentAtomSize alignment validation errors + try { + const vk::raii::Device& device = renderer->GetRaiiDevice(); + // Flush params buffer + vk::MappedMemoryRange flushRangeParams; + flushRangeParams.memory = *vulkanResources.paramsBufferMemory; + flushRangeParams.offset = 0; + flushRangeParams.size = VK_WHOLE_SIZE; + device.flushMappedMemoryRanges(flushRangeParams); + // Flush physics buffer (object data) + vk::MappedMemoryRange flushRangePhysics; + flushRangePhysics.memory = *vulkanResources.physicsBufferMemory; + flushRangePhysics.offset = 0; + flushRangePhysics.size = VK_WHOLE_SIZE; + device.flushMappedMemoryRanges(flushRangePhysics); + // Flush counter buffer (pair and collision counters) + vk::MappedMemoryRange flushRangeCounter; + flushRangeCounter.memory = *vulkanResources.counterBufferMemory; + flushRangeCounter.offset = 0; + flushRangeCounter.size = VK_WHOLE_SIZE; + device.flushMappedMemoryRanges(flushRangeCounter); + } catch (const std::exception& e) { + fprintf(stderr, "WARNING: Failed to flush mapped physics memory: %s", e.what()); + } +} + +void PhysicsSystem::ReadbackGPUPhysicsData() const { + if (!renderer) { + return; + } + + // Validate Vulkan resources and persistent memory pointers before using them + if (!*vulkanResources.physicsBuffer || !*vulkanResources.physicsBufferMemory || + !vulkanResources.persistentPhysicsMemory) { + return; + } + + // Wait for a dedicated compute fence to ensure GPU compute operations are complete before reading back data + const vk::raii::Device& device = renderer->GetRaiiDevice(); + vk::Result result = device.waitForFences(*vulkanResources.computeFence, VK_TRUE, UINT64_MAX); + if (result != vk::Result::eSuccess) { + return; + } + + // Ensure GPU writes to HOST_VISIBLE memory are visible to the host before reading + try { + vk::MappedMemoryRange invalidateRangePhysics; + invalidateRangePhysics.memory = *vulkanResources.physicsBufferMemory; + invalidateRangePhysics.offset = 
0; + invalidateRangePhysics.size = VK_WHOLE_SIZE; + + vk::MappedMemoryRange invalidateRangeCounter; + invalidateRangeCounter.memory = *vulkanResources.counterBufferMemory; + invalidateRangeCounter.offset = 0; + invalidateRangeCounter.size = VK_WHOLE_SIZE; + + device.invalidateMappedMemoryRanges({invalidateRangePhysics, invalidateRangeCounter}); + } catch (const std::exception&) { + // On HOST_COHERENT heaps this may not be required; ignore errors + } + + // Optional debug: read and log pair/collision counters for a few frames + if (vulkanResources.persistentCounterMemory) { + static uint32_t lastPairCount = UINT32_MAX; + static uint32_t lastCollisionCount = UINT32_MAX; + const uint32_t* counters = static_cast(vulkanResources.persistentCounterMemory); + uint32_t pairCount = counters[0]; + uint32_t collisionCount = counters[1]; + if (pairCount != lastPairCount || collisionCount != lastCollisionCount) { + // std::cout << "Physics GPU counters - pairs: " << pairCount << ", collisions: " << collisionCount << std::endl; + lastPairCount = pairCount; + lastCollisionCount = collisionCount; + } + } + + // Skip physics buffer operations if no rigid bodies exist + if (!rigidBodies.empty()) { + // Use persistent mapped memory for physics buffer readback + const auto* gpuData = static_cast(vulkanResources.persistentPhysicsMemory); + const size_t count = std::min(rigidBodies.size(), static_cast(maxGPUObjects)); + for (size_t i = 0; i < count; i++) { + const auto concreteRigidBody = dynamic_cast(rigidBodies[i].get()); + if (!concreteRigidBody) { + continue; + } + + // Skip kinematic bodies + if (concreteRigidBody->IsKinematic()) { + continue; + } + + auto newPosition = glm::vec3(gpuData[i].position); + auto newVelocity = glm::vec3(gpuData[i].linearVelocity); + + concreteRigidBody->SetPosition(newPosition); + concreteRigidBody->SetRotation(glm::quat(gpuData[i].rotation.w, + gpuData[i].rotation.x, + gpuData[i].rotation.y, + gpuData[i].rotation.z)); + 
concreteRigidBody->SetLinearVelocity(newVelocity); + concreteRigidBody->SetAngularVelocity(glm::vec3(gpuData[i].angularVelocity)); + } + } +} + +void PhysicsSystem::SimulatePhysicsOnGPU(const std::chrono::milliseconds deltaTime, uint64_t timelineValue, uint32_t frameIndex) const { + if (!renderer) { + fprintf(stderr, "SimulatePhysicsOnGPU: No renderer available"); + return; + } + + const uint32_t numFrames = static_cast(vulkanResources.commandBuffers.size()); + if (numFrames == 0) return; + uint32_t actualFrameIndex = frameIndex % numFrames; + + // Validate Vulkan resources before using them + if (!*vulkanResources.broadPhasePipeline || !*vulkanResources.narrowPhasePipeline || + !*vulkanResources.integratePipeline || !*vulkanResources.pipelineLayout || + vulkanResources.descriptorSets.empty() || !*vulkanResources.physicsBuffer || + !*vulkanResources.counterBuffer || !*vulkanResources.paramsBuffer) { + return; + } + + // Update physics data on the GPU + UpdateGPUPhysicsData(deltaTime); + + // Reset the command buffer before beginning (required for reuse) + vulkanResources.commandBuffers[actualFrameIndex].reset(); + + // Begin command buffer + vk::CommandBufferBeginInfo beginInfo; + beginInfo.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit; + + vulkanResources.commandBuffers[actualFrameIndex].begin(beginInfo); + + vulkanResources.commandBuffers[actualFrameIndex].bindDescriptorSets( + vk::PipelineBindPoint::eCompute, + *vulkanResources.pipelineLayout, + 0, + **vulkanResources.descriptorSets.data(), + nullptr); + + // MODERN BARRIER: Use vk::DependencyInfo and vk::BufferMemoryBarrier2 (Synchronization 2) + // Ensures host-written data is visible to compute shaders. 
+ vk::BufferMemoryBarrier2 hostToDeviceBarrier{ + .srcStageMask = vk::PipelineStageFlagBits2::eHost, + .srcAccessMask = vk::AccessFlagBits2::eHostWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eShaderWrite, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = *vulkanResources.physicsBuffer, + .offset = 0, + .size = VK_WHOLE_SIZE + }; + vk::DependencyInfo depInfo{.bufferMemoryBarrierCount = 1, .pBufferMemoryBarriers = &hostToDeviceBarrier}; + vulkanResources.commandBuffers[actualFrameIndex].pipelineBarrier2(depInfo); + + // Step 1: Integrate forces and velocities + vulkanResources.commandBuffers[actualFrameIndex].bindPipeline(vk::PipelineBindPoint::eCompute, *vulkanResources.integratePipeline); + vulkanResources.commandBuffers[actualFrameIndex].dispatch(static_cast((rigidBodies.size() + 63) / 64), 1, 1); + + // MODERN BARRIER: Ensure integration is complete before collision detection + vk::BufferMemoryBarrier2 computeBarrier{ + .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader, + .srcAccessMask = vk::AccessFlagBits2::eShaderWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .buffer = *vulkanResources.physicsBuffer, + .offset = 0, + .size = VK_WHOLE_SIZE + }; + depInfo.pBufferMemoryBarriers = &computeBarrier; + vulkanResources.commandBuffers[actualFrameIndex].pipelineBarrier2(depInfo); + + // Step 2: Broad-phase collision detection + // Optimization: skip broad-phase if no dynamic bodies exist to avoid O(N^2) GPU cost + if (hasDynamicBodies || needsBroadPhase) { + vulkanResources.commandBuffers[actualFrameIndex].bindPipeline(vk::PipelineBindPoint::eCompute, *vulkanResources.broadPhasePipeline); + uint32_t nBodies = 
static_cast(std::min(rigidBodies.size(), static_cast(maxGPUObjects))); + uint32_t numPairs = (nBodies * (nBodies - 1)) / 2; + uint32_t broadPhaseThreads = (numPairs + 63) / 64; + vulkanResources.commandBuffers[actualFrameIndex].dispatch(std::max(1u, broadPhaseThreads), 1, 1); + needsBroadPhase = false; + } + + // MODERN BARRIER: Broad phase to Narrow phase + vulkanResources.commandBuffers[actualFrameIndex].pipelineBarrier2(depInfo); + + // Step 3: Narrow-phase collision detection + vulkanResources.commandBuffers[actualFrameIndex].bindPipeline(vk::PipelineBindPoint::eCompute, *vulkanResources.narrowPhasePipeline); + uint32_t narrowPhaseThreads = (maxGPUCollisions + 63) / 64; + vulkanResources.commandBuffers[actualFrameIndex].dispatch(narrowPhaseThreads, 1, 1); + + // MODERN BARRIER: Narrow phase to Resolution + vulkanResources.commandBuffers[actualFrameIndex].pipelineBarrier2(depInfo); + + // Step 4: Collision resolution + vulkanResources.commandBuffers[actualFrameIndex].bindPipeline(vk::PipelineBindPoint::eCompute, *vulkanResources.resolvePipeline); + uint32_t resolveThreads = (maxGPUCollisions + 63) / 64; + vulkanResources.commandBuffers[actualFrameIndex].dispatch(resolveThreads, 1, 1); + + // End command buffer + vulkanResources.commandBuffers[actualFrameIndex].end(); + + // Submit the command buffer with timeline semaphore for non-blocking coordination + vk::CommandBuffer cmdBuffer = *vulkanResources.commandBuffers[actualFrameIndex]; + if (timelineValue > 0) { + renderer->SubmitToComputeQueue2(cmdBuffer, renderer->GetFrameTimeline(), timelineValue); + } else { + // Fallback to legacy submit if no timeline provided (should not happen in Sync2Engine) + const vk::raii::Device& device = renderer->GetRaiiDevice(); + device.resetFences(*vulkanResources.computeFence); + renderer->SubmitToComputeQueue(cmdBuffer, *vulkanResources.computeFence); + ReadbackGPUPhysicsData(); // Legacy path blocks on CPU + } +} + +void PhysicsSystem::CleanupMarkedBodies() { + // Remove rigid 
bodies that are marked for removal + auto it = rigidBodies.begin(); + while (it != rigidBodies.end()) { + auto concreteRigidBody = dynamic_cast(it->get()); + if (concreteRigidBody && concreteRigidBody->markedForRemoval) { + it = rigidBodies.erase(it); + } else { + ++it; + } + } +} diff --git a/attachments/sync2_engine/physics_system.h b/attachments/sync2_engine/physics_system.h new file mode 100644 index 00000000..1142fc1b --- /dev/null +++ b/attachments/sync2_engine/physics_system.h @@ -0,0 +1,454 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +class Entity; +class Renderer; + +/** + * @brief Enum for different collision shapes. + */ +enum class CollisionShape { + Box, + Sphere, + Capsule, + Mesh +}; + +/** + * @brief Class representing a rigid body for physics simulation. + */ +class RigidBody { + public: + /** + * @brief Default constructor. + */ + RigidBody() = default; + + /** + * @brief Destructor for proper cleanup. + */ + virtual ~RigidBody() = default; + + /** + * @brief Set the position of the rigid body. + * @param position The position. + */ + virtual void SetPosition(const glm::vec3& position) = 0; + + /** + * @brief Set the rotation of the rigid body. + * @param rotation The rotation quaternion. 
+ */ + virtual void SetRotation(const glm::quat& rotation) = 0; + + /** + * @brief Set the scale of the rigid body. + * @param scale The scale. + */ + virtual void SetScale(const glm::vec3& scale) = 0; + + /** + * @brief Set the mass of the rigid body. + * @param mass The mass. + */ + virtual void SetMass(float mass) = 0; + + /** + * @brief Set the restitution (bounciness) of the rigid body. + * @param restitution The restitution (0.0f to 1.0f). + */ + virtual void SetRestitution(float restitution) = 0; + + /** + * @brief Set the friction of the rigid body. + * @param friction The friction (0.0f to 1.0f). + */ + virtual void SetFriction(float friction) = 0; + + /** + * @brief Apply a force to the rigid body. + * @param force The force vector. + * @param localPosition The local position to apply the force at. + */ + virtual void ApplyForce(const glm::vec3& force, const glm::vec3& localPosition = glm::vec3(0.0f)) = 0; + + /** + * @brief Apply an impulse to the rigid body. + * @param impulse The impulse vector. + * @param localPosition The local position to apply the impulse at. + */ + virtual void ApplyImpulse(const glm::vec3& impulse, const glm::vec3& localPosition = glm::vec3(0.0f)) = 0; + + /** + * @brief Set the linear velocity of the rigid body. + * @param velocity The linear velocity. + */ + virtual void SetLinearVelocity(const glm::vec3& velocity) = 0; + + /** + * @brief Set the angular velocity of the rigid body. + * @param velocity The angular velocity. + */ + virtual void SetAngularVelocity(const glm::vec3& velocity) = 0; + + /** + * @brief Get the position of the rigid body. + * @return The position. + */ + [[nodiscard]] virtual glm::vec3 GetPosition() const = 0; + + /** + * @brief Get the rotation of the rigid body. + * @return The rotation quaternion. + */ + [[nodiscard]] virtual glm::quat GetRotation() const = 0; + + /** + * @brief Get the linear velocity of the rigid body. + * @return The linear velocity. 
+ */ + [[nodiscard]] virtual glm::vec3 GetLinearVelocity() const = 0; + + /** + * @brief Get the angular velocity of the rigid body. + * @return The angular velocity. + */ + [[nodiscard]] virtual glm::vec3 GetAngularVelocity() const = 0; + + /** + * @brief Set whether the rigid body is kinematic. + * @param kinematic Whether the rigid body is kinematic. + */ + virtual void SetKinematic(bool kinematic) = 0; + + /** + * @brief Check if the rigid body is kinematic. + * @return True if kinematic, false otherwise. + */ + [[nodiscard]] virtual bool IsKinematic() const = 0; +}; + +/** + * @brief Structure for GPU physics data. + */ +struct GPUPhysicsData { + glm::vec4 position; // xyz = position, w = inverse mass + glm::vec4 rotation; // quaternion + glm::vec4 linearVelocity; // xyz = velocity, w = restitution + glm::vec4 angularVelocity; // xyz = angular velocity, w = friction + glm::vec4 force; // xyz = force, w = is kinematic (0 or 1) + glm::vec4 torque; // xyz = torque, w = use gravity (0 or 1) + glm::vec4 colliderData; // type-specific data (e.g., radius for spheres) + glm::vec4 colliderData2; // additional collider data (e.g., box half extents) +}; + +/** + * @brief Structure for GPU collision data. + */ +struct GPUCollisionData { + uint32_t bodyA; + uint32_t bodyB; + glm::vec4 contactNormal; // xyz = normal, w = penetration depth + glm::vec4 contactPoint; // xyz = contact point, w = unused +}; + +/** + * @brief Structure for physics simulation parameters. 
+ */ +struct PhysicsParams { + float deltaTime; // Time step - 4 bytes + uint32_t numBodies; // Number of rigid bodies - 4 bytes + uint32_t maxCollisions; // Maximum number of collisions - 4 bytes + float padding; // Explicit padding to align gravity to 16-byte boundary - 4 bytes + glm::vec4 gravity; // Gravity vector (xyz) + padding (w) - 16 bytes + // Total: 32 bytes (aligned to 16-byte boundaries for std140 layout) +}; + +/** + * @brief Structure to store collision prediction data for a ray-based collision system. + */ +struct CollisionPrediction { + float collisionTime = -1.0f; // Time within deltaTime when the collision occurs (-1 = no collision) + glm::vec3 collisionPoint; // World position where collision occurs + glm::vec3 collisionNormal; // Surface normal at collision point + glm::vec3 newVelocity; // Predicted velocity after bounce + Entity* hitEntity = nullptr; // Entity that was hit + bool isValid = false; // Whether this prediction is valid +}; + +/** + * @brief Class for managing physics simulation. + * + * This class implements the physics system as described in the Subsystems chapter: + * @see en/Building_a_Simple_Engine/Subsystems/04_physics_basics.adoc + * @see en/Building_a_Simple_Engine/Subsystems/05_vulkan_physics.adoc + */ +class PhysicsSystem { + public: + /** + * @brief Default constructor. + */ + PhysicsSystem() = default; + + // Constructor-based initialization replacing separate Initialize/Set* calls + explicit PhysicsSystem(Renderer* _renderer, bool enableGPU = true) { + SetRenderer(_renderer); + SetGPUAccelerationEnabled(enableGPU); + if (!Initialize()) { + throw std::runtime_error("PhysicsSystem: initialization failed"); + } + } + + /** + * @brief Destructor for proper cleanup. + */ + ~PhysicsSystem(); + + /** + * @brief Update the physics system. + * @param deltaTime The time elapsed since the last update. + * @param timelineValue The current timeline value to signal upon completion. 
+ * @param frameIndex Optional frame slot index for resource rotation. + */ + void Update(std::chrono::milliseconds deltaTime, uint64_t timelineValue = 0, uint32_t frameIndex = 0); + + /** + * @brief Create a rigid body. + * @param entity The entity to attach the rigid body to. + * @param shape The collision shape. + * @param mass The mass. + * @return Pointer to the created rigid body, or nullptr if creation failed. + */ + RigidBody* CreateRigidBody(Entity* entity, CollisionShape shape, float mass); + + /** + * @brief Destroy a rigid body. + * @param rigidBody The rigid body to destroy. + * @return True if destruction was successful, false otherwise. + */ + bool DestroyRigidBody(RigidBody* rigidBody); + + /** + * @brief Set the gravity of the physics world. + * @param _gravity The gravity vector. + */ + void SetGravity(const glm::vec3& _gravity); + + /** + * @brief Get the gravity of the physics world. + * @return The gravity vector. + */ + [[nodiscard]] glm::vec3 GetGravity() const; + + /** + * @brief Perform a raycast. + * @param origin The origin of the ray. + * @param direction The direction of the ray. + * @param maxDistance The maximum distance of the ray. + * @param hitPosition Output parameter for the hit position. + * @param hitNormal Output parameter for the hit normal. + * @param hitEntity Output parameter for the hit entity. + * @return True if the ray hit something, false otherwise. + */ + bool Raycast(const glm::vec3& origin, + const glm::vec3& direction, + float maxDistance, + glm::vec3* hitPosition, + glm::vec3* hitNormal, + Entity** hitEntity) const; + + /** + * @brief Enable or disable GPU acceleration. + * @param enabled Whether GPU acceleration is enabled. + */ + void SetGPUAccelerationEnabled(bool enabled) { + // Enforce GPU-only policy: disabling GPU acceleration is not allowed in this project. + // Ignore attempts to disable and keep GPU acceleration enabled. 
+ gpuAccelerationEnabled = true; + } + + /** + * @brief Check if GPU acceleration is enabled. + * @return True, if GPU acceleration is enabled, false otherwise. + */ + [[nodiscard]] bool IsGPUAccelerationEnabled() const { + return gpuAccelerationEnabled; + } + + /** + * @brief Set the maximum number of objects that can be simulated on the GPU. + * @param maxObjects The maximum number of objects. + */ + void SetMaxGPUObjects(uint32_t maxObjects) { + maxGPUObjects = maxObjects; + } + + /** + * @brief Set the renderer to use during GPU acceleration. + * @param _renderer The renderer. + */ + void SetRenderer(Renderer* _renderer) { + renderer = _renderer; + } + + /** + * @brief Set the current camera position for geometry-relative ball checking. + * @param _cameraPosition The current camera position. + */ + void SetCameraPosition(const glm::vec3& _cameraPosition) { + cameraPosition = _cameraPosition; + } + + // Thread-safe enqueue for rigid body creation from any thread + void EnqueueRigidBodyCreation(Entity* entity, + CollisionShape shape, + float mass, + bool kinematic, + float restitution, + float friction); + + private: + /** + * @brief Initialize the physics system (called by constructor). + * @return True if initialization was successful, false otherwise. + */ + bool Initialize(); + + /** + * @brief Clean up rigid bodies that are marked for removal. + */ + void CleanupMarkedBodies(); + + /** + * @brief Helper function to create a mapped buffer with memory allocation. + * @param size The size of the buffer in bytes. + * @param usage The buffer usage flags. + * @param buffer Reference to the buffer RAII object. + * @param memory Reference to the memory RAII object. + * @param errorPrefix Prefix for error messages. 
+ */ + void CreateMappedBuffer(vk::DeviceSize size, + vk::BufferUsageFlags usage, + vk::raii::Buffer& buffer, + vk::raii::DeviceMemory& memory, + const std::string& errorPrefix); + + // Pending rigid body creations queued from background threads + struct PendingCreation { + Entity* entity; + CollisionShape shape; + float mass; + bool kinematic; + float restitution; + float friction; + }; + std::mutex pendingMutex; + std::vector pendingCreations; + + // Rigid bodies + mutable std::mutex rigidBodiesMutex; // Protect concurrent access to rigidBodies + std::vector> rigidBodies; + + // Gravity + glm::vec3 gravity = glm::vec3(0.0f, -9.81f, 0.0f); + + // Whether the physics system is initialized + bool initialized = false; + + // GPU acceleration + bool gpuAccelerationEnabled = false; + uint32_t maxGPUObjects = 1024; + uint32_t maxGPUCollisions = 4096; + Renderer* renderer = nullptr; + + // Camera position for geometry-relative ball checking + glm::vec3 cameraPosition = glm::vec3(0.0f, 0.0f, 0.0f); + + // Track dynamic bodies for simulation optimization + mutable uint32_t lastDynamicBodyCount = 0; + mutable bool hasDynamicBodies = false; + mutable bool needsBroadPhase = true; + + // Vulkan resources for physics simulation + struct VulkanResources { + // Shader modules + vk::raii::ShaderModule integrateShaderModule = nullptr; + vk::raii::ShaderModule broadPhaseShaderModule = nullptr; + vk::raii::ShaderModule narrowPhaseShaderModule = nullptr; + vk::raii::ShaderModule resolveShaderModule = nullptr; + + // Pipeline layouts and compute pipelines + vk::raii::DescriptorSetLayout descriptorSetLayout = nullptr; + vk::raii::PipelineLayout pipelineLayout = nullptr; + vk::raii::Pipeline integratePipeline = nullptr; + vk::raii::Pipeline broadPhasePipeline = nullptr; + vk::raii::Pipeline narrowPhasePipeline = nullptr; + vk::raii::Pipeline resolvePipeline = nullptr; + + // Descriptor pool and sets + vk::raii::DescriptorPool descriptorPool = nullptr; + std::vector descriptorSets; + + // 
Buffers for physics data + vk::raii::Buffer physicsBuffer = nullptr; + vk::raii::DeviceMemory physicsBufferMemory = nullptr; + vk::raii::Buffer collisionBuffer = nullptr; + vk::raii::DeviceMemory collisionBufferMemory = nullptr; + vk::raii::Buffer pairBuffer = nullptr; + vk::raii::DeviceMemory pairBufferMemory = nullptr; + vk::raii::Buffer counterBuffer = nullptr; + vk::raii::DeviceMemory counterBufferMemory = nullptr; + vk::raii::Buffer paramsBuffer = nullptr; + vk::raii::DeviceMemory paramsBufferMemory = nullptr; + + // Persistent mapped memory pointers for improved performance + void* persistentPhysicsMemory = nullptr; + void* persistentCounterMemory = nullptr; + void* persistentParamsMemory = nullptr; + + // Command buffers for compute operations (one per frame in flight) + vk::raii::CommandPool commandPool = nullptr; + std::vector commandBuffers; + + // Dedicated fence for compute synchronization + vk::raii::Fence computeFence = nullptr; + }; + + VulkanResources vulkanResources; + + // Initialize Vulkan resources for physics simulation + bool InitializeVulkanResources(); + void CleanupVulkanResources(); + + // Update physics data on the GPU + void UpdateGPUPhysicsData(std::chrono::milliseconds deltaTime) const; + + // Read back physics data from the GPU + void ReadbackGPUPhysicsData() const; + + // Perform GPU-accelerated physics simulation + void SimulatePhysicsOnGPU(std::chrono::milliseconds deltaTime, uint64_t timelineValue = 0, uint32_t frameIndex = 0) const; +}; diff --git a/attachments/sync2_engine/pipeline.cpp b/attachments/sync2_engine/pipeline.cpp new file mode 100644 index 00000000..42fe2dfa --- /dev/null +++ b/attachments/sync2_engine/pipeline.cpp @@ -0,0 +1,727 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "pipeline.h" +#include "mesh_component.h" +#include +#include + +// Constructor +Pipeline::Pipeline(VulkanDevice& device, SwapChain& swapChain) : device(device), swapChain(swapChain) { +} + +// Create descriptor set layout +bool Pipeline::createDescriptorSetLayout() { + try { + // Create descriptor set layout bindings + std::array bindings = { + vk::DescriptorSetLayoutBinding{ + .binding = 0, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + vk::DescriptorSetLayoutBinding{ + .binding = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + } + }; + + // Create descriptor set layout + vk::DescriptorSetLayoutCreateInfo layoutInfo{ + .bindingCount = static_cast(bindings.size()), + .pBindings = bindings.data() + }; + + descriptorSetLayout = vk::raii::DescriptorSetLayout(device.getDevice(), layoutInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +// Create PBR descriptor set layout +bool Pipeline::createPBRDescriptorSetLayout() { + try { + // Create descriptor set layout bindings for PBR shader + std::array bindings = { + // Binding 0: Uniform buffer (UBO) + vk::DescriptorSetLayoutBinding{ + .binding = 0, + .descriptorType = 
vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 1: Base color map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 2: Metallic roughness map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 2, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 3: Normal map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 3, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 4: Occlusion map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 4, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 5: Emissive map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 5, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 6: Light storage buffer (StructuredBuffer) + vk::DescriptorSetLayoutBinding{ + .binding = 6, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + } + }; + + // Create descriptor set layout + vk::DescriptorSetLayoutCreateInfo layoutInfo{ + .bindingCount = static_cast(bindings.size()), + .pBindings = bindings.data() + }; + + 
pbrDescriptorSetLayout = vk::raii::DescriptorSetLayout(device.getDevice(), layoutInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create PBR descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +// Create graphics pipeline +bool Pipeline::createGraphicsPipeline() { + try { + // Read shader code + auto vertShaderCode = readFile("shaders/texturedMesh.spv"); + auto fragShaderCode = readFile("shaders/texturedMesh.spv"); + + // Create shader modules + vk::raii::ShaderModule vertShaderModule = createShaderModule(vertShaderCode); + vk::raii::ShaderModule fragShaderModule = createShaderModule(fragShaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *vertShaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *fragShaderModule, + .pName = "PSMain" + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Create vertex input info + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = 0, + .pVertexBindingDescriptions = nullptr, + .vertexAttributeDescriptionCount = 0, + .pVertexAttributeDescriptions = nullptr + }; + + // Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Create viewport state info + // Note: viewport and scissor are dynamic states, so we only need counts + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .pViewports = nullptr, + .scissorCount = 1, + .pScissors = nullptr + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + 
.polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eBack, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 1.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_FALSE, + .alphaToOneEnable = VK_FALSE + }; + + // Create depth stencil state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLess, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE, + .front = {}, + .back = {}, + .minDepthBounds = 0.0f, + .maxDepthBounds = 1.0f + }; + + // Create color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = vk::BlendFactor::eOne, + .dstColorBlendFactor = vk::BlendFactor::eZero, + .colorBlendOp = vk::BlendOp::eAdd, + .srcAlphaBlendFactor = vk::BlendFactor::eOne, + .dstAlphaBlendFactor = vk::BlendFactor::eZero, + .alphaBlendOp = vk::BlendOp::eAdd, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + std::array blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}; + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment, + .blendConstants = blendConstants + }; + + // Create dynamic state info + std::vector dynamicStates = { + vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + 
.dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = 1, + .pSetLayouts = &*descriptorSetLayout, + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr + }; + + pipelineLayout = vk::raii::PipelineLayout(device.getDevice(), pipelineLayoutInfo); + + // Create graphics pipeline + vk::GraphicsPipelineCreateInfo pipelineInfo{ + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizer, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *pipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + + // Create pipeline with dynamic rendering + vk::Format swapChainFormat = swapChain.getSwapChainImageFormat(); + vk::PipelineRenderingCreateInfo renderingInfo{ + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainFormat, + .depthAttachmentFormat = vk::Format::eD32Sfloat, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + pipelineInfo.pNext = &renderingInfo; + + graphicsPipeline = vk::raii::Pipeline(device.getDevice(), nullptr, pipelineInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create graphics pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create PBR pipeline +bool Pipeline::createPBRPipeline() { + try { + // Create PBR descriptor set layout + if (!createPBRDescriptorSetLayout()) { + return false; + } + + // Read shader code + auto vertShaderCode = readFile("shaders/pbr.spv"); + auto fragShaderCode = readFile("shaders/pbr.spv"); + + // Create shader modules + vk::raii::ShaderModule vertShaderModule = 
createShaderModule(vertShaderCode); + vk::raii::ShaderModule fragShaderModule = createShaderModule(fragShaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *vertShaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *fragShaderModule, + .pName = "PSMain" // Changed from FSMain to PSMain to match the shader + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Define vertex and instance binding descriptions using MeshComponent layouts + auto vertexBinding = Vertex::getBindingDescription(); + auto instanceBinding = InstanceData::getBindingDescription(); + std::array bindingDescriptions = {vertexBinding, instanceBinding}; + + // Define vertex and instance attribute descriptions + auto vertexAttrArray = Vertex::getAttributeDescriptions(); + auto instanceAttrArray = InstanceData::getAttributeDescriptions(); + std::array attributeDescriptions{}; + // Copy vertex attributes (0..3) + for (size_t i = 0; i < vertexAttrArray.size(); ++i) { + attributeDescriptions[i] = vertexAttrArray[i]; + } + // Copy instance attributes (4..10) + for (size_t i = 0; i < instanceAttrArray.size(); ++i) { + attributeDescriptions[vertexAttrArray.size() + i] = instanceAttrArray[i]; + } + + // Create vertex input info + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = static_cast(bindingDescriptions.size()), + .pVertexBindingDescriptions = bindingDescriptions.data(), + .vertexAttributeDescriptionCount = static_cast(attributeDescriptions.size()), + .pVertexAttributeDescriptions = attributeDescriptions.data() + }; + + // Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + 
+ // Create viewport state info + // Note: viewport and scissor are dynamic states, so we only need counts + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .pViewports = nullptr, + .scissorCount = 1, + .pScissors = nullptr + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eBack, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 1.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_TRUE, + .alphaToOneEnable = VK_FALSE + }; + + // Create depth stencil state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLess, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE, + .front = {}, + .back = {}, + .minDepthBounds = 0.0f, + .maxDepthBounds = 1.0f + }; + + // Create color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = vk::BlendFactor::eOne, + .dstColorBlendFactor = vk::BlendFactor::eZero, + .colorBlendOp = vk::BlendOp::eAdd, + .srcAlphaBlendFactor = vk::BlendFactor::eOne, + .dstAlphaBlendFactor = vk::BlendFactor::eZero, + .alphaBlendOp = vk::BlendOp::eAdd, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + std::array < float, 4 > blendConstants = 
{0.0f, 0.0f, 0.0f, 0.0f}; + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment, + .blendConstants = blendConstants + }; + + // Create dynamic state info + std::vector dynamicStates = { + vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create push constant range for material properties + vk::PushConstantRange pushConstantRange{ + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .offset = 0, + .size = sizeof(MaterialProperties) + }; + + // Create pipeline layout using the PBR descriptor set layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = 1, + .pSetLayouts = &*pbrDescriptorSetLayout, // Use PBR descriptor set layout + .pushConstantRangeCount = 1, + .pPushConstantRanges = &pushConstantRange + }; + + pbrPipelineLayout = vk::raii::PipelineLayout(device.getDevice(), pipelineLayoutInfo); + + // Create graphics pipeline + vk::GraphicsPipelineCreateInfo pipelineInfo{ + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizer, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + + // Create pipeline with dynamic rendering + vk::Format swapChainFormat = swapChain.getSwapChainImageFormat(); + vk::PipelineRenderingCreateInfo renderingInfo{ + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainFormat, + .depthAttachmentFormat = vk::Format::eD32Sfloat, + 
.stencilAttachmentFormat = vk::Format::eUndefined + }; + + pipelineInfo.pNext = &renderingInfo; + + pbrGraphicsPipeline = vk::raii::Pipeline(device.getDevice(), nullptr, pipelineInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create PBR pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create lighting pipeline +bool Pipeline::createLightingPipeline() { + try { + // Read shader code + auto vertShaderCode = readFile("shaders/lighting.spv"); + auto fragShaderCode = readFile("shaders/lighting.spv"); + + // Create shader modules + vk::raii::ShaderModule vertShaderModule = createShaderModule(vertShaderCode); + vk::raii::ShaderModule fragShaderModule = createShaderModule(fragShaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *vertShaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *fragShaderModule, + .pName = "PSMain" + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Create vertex input info + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = 0, + .pVertexBindingDescriptions = nullptr, + .vertexAttributeDescriptionCount = 0, + .pVertexAttributeDescriptions = nullptr + }; + + // Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Create viewport state info + // Note: viewport and scissor are dynamic states, so we only need counts + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .pViewports = nullptr, + .scissorCount = 1, + .pScissors = nullptr + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + 
.depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eBack, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .depthBiasConstantFactor = 0.0f, + .depthBiasClamp = 0.0f, + .depthBiasSlopeFactor = 0.0f, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE, + .minSampleShading = 1.0f, + .pSampleMask = nullptr, + .alphaToCoverageEnable = VK_FALSE, + .alphaToOneEnable = VK_FALSE + }; + + // Create depth stencil state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLess, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE, + .front = {}, + .back = {}, + .minDepthBounds = 0.0f, + .maxDepthBounds = 1.0f + }; + + // Create color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_FALSE, + .srcColorBlendFactor = vk::BlendFactor::eOne, + .dstColorBlendFactor = vk::BlendFactor::eZero, + .colorBlendOp = vk::BlendOp::eAdd, + .srcAlphaBlendFactor = vk::BlendFactor::eOne, + .dstAlphaBlendFactor = vk::BlendFactor::eZero, + .alphaBlendOp = vk::BlendOp::eAdd, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + std::array < float, 4 > blendConstants = {0.0f, 0.0f, 0.0f, 0.0f}; + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment, + .blendConstants = blendConstants + }; + + // Create dynamic state info + std::vector dynamicStates = { + vk::DynamicState::eViewport, + 
vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create push constant range for material properties + vk::PushConstantRange pushConstantRange{ + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .offset = 0, + .size = sizeof(MaterialProperties) + }; + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = 1, + .pSetLayouts = &*descriptorSetLayout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &pushConstantRange + }; + + lightingPipelineLayout = vk::raii::PipelineLayout(device.getDevice(), pipelineLayoutInfo); + + // Create graphics pipeline + vk::GraphicsPipelineCreateInfo pipelineInfo{ + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizer, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *lightingPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + + // Create pipeline with dynamic rendering + vk::Format swapChainFormat = swapChain.getSwapChainImageFormat(); + vk::PipelineRenderingCreateInfo renderingInfo{ + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainFormat, + .depthAttachmentFormat = vk::Format::eD32Sfloat, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + pipelineInfo.pNext = &renderingInfo; + + lightingPipeline = vk::raii::Pipeline(device.getDevice(), nullptr, pipelineInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create lighting pipeline: " << e.what() << std::endl; + return false; + } +} + +// Push material properties +void 
Pipeline::pushMaterialProperties(vk::CommandBuffer commandBuffer, const MaterialProperties& material) { + commandBuffer.pushConstants(*pbrPipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, material); +} + +// Create shader module +vk::raii::ShaderModule Pipeline::createShaderModule(const std::vector& code) { + vk::ShaderModuleCreateInfo createInfo{ + .codeSize = code.size(), + .pCode = reinterpret_cast(code.data()) + }; + + return vk::raii::ShaderModule(device.getDevice(), createInfo); +} + +// Read file +std::vector Pipeline::readFile(const std::string& filename) { + std::vector searchPaths = { + filename, + "cmake-build-debug/" + filename, + "cmake-build-release/" + filename, + "build/" + filename, + "../simple_engine/" + filename, + "../simple_engine/cmake-build-debug/" + filename, + "../" + filename, + "../../" + filename + }; + + for (const auto& path : searchPaths) { + std::ifstream file(path, std::ios::ate | std::ios::binary); + if (file.is_open()) { + size_t fileSize = file.tellg(); + std::vector buffer(fileSize); + file.seekg(0); + file.read(buffer.data(), fileSize); + file.close(); + return buffer; + } + } + + std::cerr << "CRITICAL: Pipeline::Failed to open " << filename << " in any of the following paths:" << std::endl; + for (const auto& path : searchPaths) { + std::cerr << " - " << path << std::endl; + } + + throw std::runtime_error("Failed to open file in any search path: " + filename); +} diff --git a/attachments/sync2_engine/renderer.h b/attachments/sync2_engine/renderer.h new file mode 100644 index 00000000..5438de12 --- /dev/null +++ b/attachments/sync2_engine/renderer.h @@ -0,0 +1,2181 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "camera_component.h" +#include "entity.h" +#include "memory_pool.h" +#include "mesh_component.h" +#include "model_loader.h" +#include "platform.h" +#include "thread_pool.h" + +// Fallback defines for optional extension names (allow compiling against older headers) +#ifndef VK_EXT_ROBUSTNESS_2_EXTENSION_NAME +# define VK_EXT_ROBUSTNESS_2_EXTENSION_NAME "VK_EXT_robustness2" +#endif +#ifndef VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME +# define VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME "VK_KHR_dynamic_rendering_local_read" +#endif +#ifndef VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME +# define VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME "VK_EXT_shader_tile_image" +#endif + +// Forward declarations +class ImGuiSystem; + +/** + * @brief Structure for Vulkan queue family indices. + */ +struct QueueFamilyIndices { + std::optional graphicsFamily; + std::optional presentFamily; + std::optional computeFamily; + std::optional transferFamily; // optional dedicated transfer queue family + + [[nodiscard]] bool isComplete() const { + return graphicsFamily.has_value() && presentFamily.has_value() && computeFamily.has_value(); + } +}; + +/** + * @brief Structure for swap chain support details. 
+ */ +struct SwapChainSupportDetails { + vk::SurfaceCapabilitiesKHR capabilities; + std::vector formats; + std::vector presentModes; +}; + +/** + * @brief Structure for individual light data in the storage buffer. + */ +struct LightData { + alignas(16) glm::vec4 position; // Light position (w component used for direction vs position) + alignas(16) glm::vec4 color; // Light color and intensity + alignas(16) glm::mat4 lightSpaceMatrix; // Light space matrix for shadow mapping + alignas(16) glm::vec4 direction; // Light direction (for directional/spotlights) + alignas(4) int lightType; // 0=Point, 1=Directional, 2=Spot, 3=Emissive + alignas(4) float range; // Light range + alignas(4) float innerConeAngle; // For spotlights + alignas(4) float outerConeAngle; // For spotlights +}; + +struct ShadowUniforms { + alignas(16) glm::mat4 view; + alignas(16) glm::mat4 proj; +}; + +struct ShadowPushConstants { + alignas(16) glm::mat4 model; +}; + +/** + * @brief Structure for the uniform buffer object (now without fixed light arrays). 
+ */ +struct UniformBufferObject { + alignas(16) glm::mat4 model; + alignas(16) glm::mat4 view; + alignas(16) glm::mat4 proj; + alignas(16) glm::vec4 camPos; + alignas(4) float exposure; + alignas(4) float gamma; + alignas(4) float prefilteredCubeMipLevels; + alignas(4) float scaleIBLAmbient; + alignas(4) int lightCount; + alignas(4) int padding0; // match shader UBO layout + alignas(4) float padding1; // match shader UBO layout + alignas(4) float padding2; // match shader UBO layout + alignas(8) glm::vec2 screenDimensions; + alignas(4) float nearZ; + alignas(4) float farZ; + alignas(4) float slicesZ; + alignas(4) float _uboPad3; + // Planar reflections + alignas(16) glm::mat4 reflectionVP; // projection * mirroredView + alignas(4) int reflectionEnabled; // 1 when sampling reflection in main pass + alignas(4) int reflectionPass; // 1 during reflection render pass + alignas(8) glm::vec2 _reflectPad0; + alignas(16) glm::vec4 clipPlaneWS; // world-space plane ax+by+cz+d=0 + // Controls + alignas(4) float reflectionIntensity; // scales reflection mix in glass + alignas(4) int enableRayQueryReflections = 1; // 1 to enable reflections in ray query mode + alignas(4) int enableRayQueryTransparency = 1; // 1 to enable transparency/refraction in ray query mode + alignas(4) float _padReflect[1]{}; + // Ray-query specific: number of per-instance geometry infos in buffer + alignas(4) int geometryInfoCount{0}; + alignas(4) int _padGeo0{0}; + alignas(4) int _padGeo1{0}; + alignas(4) int _padGeo2{0}; + alignas(16) glm::vec4 _rqReservedWorldPos{0.0f, 0.0f, 0.0f, 0.0f}; + // Ray-query specific: number of materials in materialBuffer + alignas(4) int materialCount{0}; + alignas(4) int _padMat0{0}; + alignas(4) int _padMat1{0}; + alignas(4) int _padMat2{0}; +}; + +// Ray Query uses a dedicated uniform buffer with its own tightly-defined layout. 
+// This avoids relying on the (much larger) shared raster UBO layout and prevents +// CPU↔shader layout drift from breaking Ray Query-only fields. +// +// IMPORTANT: This layout must match `RayQueryUniforms` in `shaders/ray_query.slang`. +struct RayQueryUniformBufferObject { + alignas(16) glm::mat4 model; + alignas(16) glm::mat4 view; + alignas(16) glm::mat4 proj; + alignas(16) glm::vec4 camPos; + + alignas(4) float exposure; + alignas(4) float gamma; + // Match raster UBO conventions so Ray Query can run the same lighting math. + alignas(4) float scaleIBLAmbient; + alignas(4) int lightCount; + alignas(4) int enableRayQueryReflections; + alignas(4) int enableRayQueryTransparency; + + alignas(8) glm::vec2 screenDimensions; + alignas(4) int geometryInfoCount; + alignas(4) int materialCount; + alignas(4) int _pad0; // used for rayQueryMaxBounces + // Thick-glass controls (RQ-only) + alignas(4) int enableThickGlass; // 0/1 toggle + alignas(4) float thicknessClamp; // max thickness in meters + alignas(4) float absorptionScale; // scales sigma_a + alignas(4) int _pad1; // Ray Query: enable hard shadows for direct lighting (0/1) + // Ray Query soft shadows (area-light approximation) + alignas(4) int shadowSampleCount; // 1 = hard shadows; >1 = multi-sample + alignas(4) float shadowSoftness; // 0 = hard; otherwise scales effective light radius (fraction of range) + alignas(4) float reflectionIntensity; // User control for glass reflection strength + alignas(4) float _padShadow[2]{}; +}; + +static_assert(sizeof(RayQueryUniformBufferObject) == 288, "RayQueryUniformBufferObject size must match shader layout"); +static_assert(offsetof(RayQueryUniformBufferObject, model) == 0); +static_assert(offsetof(RayQueryUniformBufferObject, view) == 64); +static_assert(offsetof(RayQueryUniformBufferObject, proj) == 128); +static_assert(offsetof(RayQueryUniformBufferObject, camPos) == 192); +static_assert(offsetof(RayQueryUniformBufferObject, exposure) == 208); 
+static_assert(offsetof(RayQueryUniformBufferObject, gamma) == 212); +static_assert(offsetof(RayQueryUniformBufferObject, scaleIBLAmbient) == 216); +static_assert(offsetof(RayQueryUniformBufferObject, lightCount) == 220); +static_assert(offsetof(RayQueryUniformBufferObject, enableRayQueryReflections) == 224); +static_assert(offsetof(RayQueryUniformBufferObject, enableRayQueryTransparency) == 228); +static_assert(offsetof(RayQueryUniformBufferObject, screenDimensions) == 232); +static_assert(offsetof(RayQueryUniformBufferObject, geometryInfoCount) == 240); +static_assert(offsetof(RayQueryUniformBufferObject, materialCount) == 244); +static_assert(offsetof(RayQueryUniformBufferObject, _pad0) == 248); +static_assert(offsetof(RayQueryUniformBufferObject, enableThickGlass) == 252); +static_assert(offsetof(RayQueryUniformBufferObject, thicknessClamp) == 256); +static_assert(offsetof(RayQueryUniformBufferObject, absorptionScale) == 260); +static_assert(offsetof(RayQueryUniformBufferObject, _pad1) == 264); +static_assert(offsetof(RayQueryUniformBufferObject, shadowSampleCount) == 268); +static_assert(offsetof(RayQueryUniformBufferObject, shadowSoftness) == 272); + +/** + * @brief Structure for PBR material properties. + * This structure must match the PushConstants structure in the PBR shader. 
+ */ +struct MaterialProperties { + alignas(16) glm::vec4 baseColorFactor; + alignas(4) float metallicFactor; + alignas(4) float roughnessFactor; + alignas(4) int baseColorTextureSet; + alignas(4) int physicalDescriptorTextureSet; + alignas(4) int normalTextureSet; + alignas(4) int occlusionTextureSet; + alignas(4) int emissiveTextureSet; + alignas(4) float alphaMask; + alignas(4) float alphaMaskCutoff; + alignas(16) glm::vec3 emissiveFactor; // Emissive factor for HDR emissive sources + alignas(4) float emissiveStrength; // KHR_materials_emissive_strength extension + alignas(4) float transmissionFactor; // KHR_materials_transmission + alignas(4) int useSpecGlossWorkflow; // 1 if using KHR_materials_pbrSpecularGlossiness + alignas(4) float glossinessFactor; // SpecGloss glossiness scalar + alignas(16) glm::vec3 specularFactor; // SpecGloss specular color factor + alignas(4) float ior = 1.5f; // index of refraction + alignas(4) bool hasEmissiveStrengthExtension; +}; + +/** + * @brief Rendering mode selection + */ +enum class RenderMode { + Rasterization, // Traditional rasterization pipeline + RayQuery // Ray query compute shader +}; + +/** + * @brief Class for managing Vulkan rendering. + * + * This class implements the rendering pipeline as described in the Engine_Architecture chapter: + * @see en/Building_a_Simple_Engine/Engine_Architecture/05_rendering_pipeline.adoc + */ +class Renderer { + public: + /** + * @brief Constructor with a platform. + * @param platform The platform to use for rendering. + */ + explicit Renderer(Platform* platform); + + /** + * @brief Destructor for proper cleanup. + */ + ~Renderer(); + + /** + * @brief Initialize the renderer. + * @param appName The name of the application. + * @param enableValidationLayers Whether to enable validation layers. + * @return True if initialization was successful, false otherwise. 
+ */ + bool Initialize(const std::string& appName, bool enableValidationLayers = true, bool debugSync = false); + + /** + * @brief Clean up renderer resources. + */ + void Cleanup(); + + /** + * @brief Render the scene. + * @param entities The entities to render. + * @param camera The camera to use for rendering. + * @param imguiSystem The ImGui system for UI rendering (optional). + */ + void Render(const std::vector>& entities, CameraComponent* camera, ImGuiSystem* imguiSystem = nullptr); + + // Render overload that accepts a snapshot of raw entity pointers. + // This allows the Engine to release its entity-container lock before rendering + // (avoiding writer starvation of background loading/physics threads). + void Render(const std::vector& entities, CameraComponent* camera, ImGuiSystem* imguiSystem = nullptr); + + /** + * @brief Wait for the device to be idle. + */ + void WaitIdle(); + + /** + * @brief Wait for fences with periodic watchdog kicks to prevent false hang detection. + * Must be called from the render thread. + */ + vk::Result waitForFencesSafe(const std::vector& fences, vk::Bool32 waitAll, uint64_t timeoutNs = 100'000'000ULL); + + /** + * @brief Wait for fences with periodic watchdog kicks to prevent false hang detection. + * Must be called from the render thread. Overload for a single fence. + */ + vk::Result waitForFencesSafe(vk::Fence fence, vk::Bool32 waitAll, uint64_t timeoutNs = 100'000'000ULL); + + /** + * @brief Dispatch a compute shader. + * @param groupCountX The number of local workgroups to dispatch in the X dimension. + * @param groupCountY The number of local workgroups to dispatch in the Y dimension. + * @param groupCountZ The number of local workgroups to dispatch in the Z dimension. + * @param inputBuffer The input buffer. + * @param outputBuffer The output buffer. + * @param hrtfBuffer The HRTF data buffer. + * @param paramsBuffer The parameters buffer. 
+ * @return A fence that can be used to synchronize with the compute operation. + */ + vk::raii::Fence DispatchCompute(uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ, + vk::Buffer inputBuffer, + vk::Buffer outputBuffer, + vk::Buffer hrtfBuffer, + vk::Buffer paramsBuffer); + + /** + * @brief RAII helper to suppress watchdog aborts during known long-running operations. + * + * The constructor stores true into the watchdogSuppressed flag of the renderer held in + * member r, and the destructor stores false, both with relaxed memory ordering. A null + * renderer pointer is accepted and turns both operations into no-ops. + * + * NOTE(review): the destructor always writes false instead of restoring the previous + * value, so nested scopes presumably re-enable the watchdog as soon as the innermost + * one ends - confirm this is intended. + */ + struct ScopedWatchdogSuppression { + Renderer* r; + explicit ScopedWatchdogSuppression(Renderer* rr) : r(rr) { + if (r) + r->watchdogSuppressed.store(true, std::memory_order_relaxed); + } + ~ScopedWatchdogSuppression() { + if (r) + r->watchdogSuppressed.store(false, std::memory_order_relaxed); + } + }; + + /** + * @brief Check if the renderer is initialized. + * @return True if the renderer is initialized, false otherwise (the value of the initialized member). + */ + bool IsInitialized() const { + return initialized; + } + + /** + * @brief Get the Vulkan device. + * @return The Vulkan device (the vk::Device handle obtained by dereferencing the device member). + */ + vk::Device GetDevice() const { + return *device; + } + + // Expose max frames in flight for per-frame resource duplication + // (returns the MAX_FRAMES_IN_FLIGHT constant) + uint32_t GetMaxFramesInFlight() const { + return MAX_FRAMES_IN_FLIGHT; + } + + /** + * @brief Get the Vulkan RAII device. + * @return The Vulkan RAII device. 
+ */ + const vk::raii::Device& GetRaiiDevice() const { + return device; + } + + // Expose frame timeline semaphore and last value + vk::Semaphore GetFrameTimeline() const { + return *frameTimeline; + } + // Current frame timeline value: the raw totalFrameCount, on the same scale as + // GetNextTimelineValue() (which returns this count plus one). + uint64_t GetCurrentTimelineValue() const { + return totalFrameCount.load(std::memory_order_relaxed); + } + // Value that follows the current one on the frame timeline (totalFrameCount + 1). + uint64_t GetNextTimelineValue() const { + return totalFrameCount.load() + 1; + } + + // Expose uploads timeline semaphore and last value for external waits + vk::Semaphore GetUploadsTimelineSemaphore() const { + return *uploadsTimeline; + } + // Atomically advances nextUploadTimelineValue by one and returns the advanced value, + // so every call hands out a different value. + uint64_t GetNextUploadTimelineValue() { + return nextUploadTimelineValue.fetch_add(1, std::memory_order_relaxed) + 1; + } + + // Signal the frame timeline semaphore without submitting any work (used to break deadlocks) + // Does nothing when frameTimeline is null. The signal is submitted through Submit2 on the + // graphics queue; Submit2 raises the value when it is not greater than what was already + // signaled or submitted, so the value actually signaled can be larger than the one requested. + void SignalFrameTimeline(uint64_t value) { + if (!*frameTimeline) return; + + vk::SemaphoreSubmitInfo signalInfo{ + .semaphore = *frameTimeline, + .value = value, + .stageMask = vk::PipelineStageFlagBits2::eAllCommands, + .deviceIndex = 0 + }; + vk::SubmitInfo2 submit2{ + .signalSemaphoreInfoCount = 1, + .pSignalSemaphoreInfos = &signalInfo + }; + Submit2(*graphicsQueue, submit2, nullptr); + } + + /** + * @brief Get the compute queue. + * + * The handle is read while holding queueMutex. + * + * @return The compute queue. + */ + vk::Queue GetComputeQueue() const { + std::lock_guard lock(queueMutex); + return *computeQueue; + } + + /** + * @brief Find a suitable memory type. + * + * Public forwarder to findMemoryType. + * + * @param typeFilter The type filter. + * @param properties The memory properties. + * @return The memory type index. + */ + uint32_t FindMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const { + return findMemoryType(typeFilter, properties); + } + + /** + * @brief Get the compute queue family index. + * @return The compute queue family index. 
+ */ + uint32_t GetComputeQueueFamilyIndex() const { + if (queueFamilyIndices.computeFamily.has_value()) { + return queueFamilyIndices.computeFamily.value(); + } + // Fallback to graphics family to avoid crashes on devices without a separate compute queue + return queueFamilyIndices.graphicsFamily.value(); + } + + /** + * @brief Submit a command buffer to the compute queue with proper dispatch loader preservation. + * + * Builds a single-command-buffer vk::SubmitInfo and submits it while holding queueMutex. + * When the compute queue handle is null the graphics queue is used instead. + * + * @param commandBuffer The command buffer to submit. + * @param fence The fence to signal when the operation completes. + */ + void SubmitToComputeQueue(vk::CommandBuffer commandBuffer, vk::Fence fence) const { + // Use mutex to ensure thread-safe access to queues + vk::SubmitInfo submitInfo{ + .commandBufferCount = 1, + .pCommandBuffers = &commandBuffer + }; + std::lock_guard lock(queueMutex); + // Prefer compute queue when available; otherwise, fall back to graphics queue to avoid crashes + if (*computeQueue) { + computeQueue.submit(submitInfo, fence); + } else { + graphicsQueue.submit(submitInfo, fence); + } + } + + /** + * @brief Unified entry point for all queue submissions using Synchronization 2. + * + * Returns without submitting when there are no command buffers, no wait or signal + * semaphores and no fence. Otherwise queueMutex is held for the whole submission. + * + * Signal entries that target frameTimeline or uploadsTimeline are copied locally and + * adjusted: when the requested value is not greater than the larger of the semaphore's + * current counter value and the last value recorded in nextFrameTimelineValue / + * nextUploadTimelineValue, it is raised to that maximum plus one. The chosen value is + * then stored back into the matching counter. Entries for other semaphores (and null + * handles) are passed through unchanged. + * + * The final values are also written back through submitInfo.pSignalSemaphoreInfos with + * const cast away, so the caller's signal-info objects must be mutable; callers read the + * assigned value from them after the call. + * + * A failing vkQueueSubmit2 is only reported on std::cerr; nothing is thrown or returned. + * + * @param queue The queue to submit to. + * @param submitInfo The submission info. + * @param fence Optional fence to signal. + */ + void Submit2(vk::Queue queue, vk::SubmitInfo2 submitInfo, vk::Fence fence = nullptr) { + if (submitInfo.commandBufferInfoCount == 0 && submitInfo.signalSemaphoreInfoCount == 0 && submitInfo.waitSemaphoreInfoCount == 0 && !fence) return; + + std::lock_guard lock(queueMutex); + + // Create local mutable copies of semaphore infos to ensure changes are seen by the driver + std::vector signalInfos; + if (submitInfo.signalSemaphoreInfoCount > 0 && submitInfo.pSignalSemaphoreInfos) { + signalInfos.assign(submitInfo.pSignalSemaphoreInfos, submitInfo.pSignalSemaphoreInfos + submitInfo.signalSemaphoreInfoCount); + + VkSemaphore hFrame = (!!*frameTimeline) ? 
(VkSemaphore)*frameTimeline : VK_NULL_HANDLE; + VkSemaphore hUploads = (!!*uploadsTimeline) ? (VkSemaphore)*uploadsTimeline : VK_NULL_HANDLE; + + for (auto& si : signalInfos) { + VkSemaphore handle = (VkSemaphore)si.semaphore; + if (handle == VK_NULL_HANDLE) continue; + + if (hFrame != VK_NULL_HANDLE && handle == hFrame) { + uint64_t currentGpu = frameTimeline.getCounterValue(); + uint64_t lastSub = nextFrameTimelineValue.load(std::memory_order_relaxed); + uint64_t val = std::max(currentGpu, lastSub); + if (si.value <= val) { + si.value = val + 1; + } + nextFrameTimelineValue.store(si.value, std::memory_order_relaxed); + } else if (hUploads != VK_NULL_HANDLE && handle == hUploads) { + uint64_t currentGpu = uploadsTimeline.getCounterValue(); + uint64_t lastSub = nextUploadTimelineValue.load(std::memory_order_relaxed); + uint64_t val = std::max(currentGpu, lastSub); + if (si.value <= val) { + si.value = val + 1; + } + nextUploadTimelineValue.store(si.value, std::memory_order_relaxed); + } + } + + // IMPORTANT: Update the CALLER's memory so they know the assigned signal value. + // (This writes through the caller's pointer with const cast away.) + for (uint32_t i = 0; i < submitInfo.signalSemaphoreInfoCount; ++i) { + const_cast(submitInfo.pSignalSemaphoreInfos)[i].value = signalInfos[i].value; + } + + // Re-point the submitInfo to our local updated array + submitInfo.pSignalSemaphoreInfos = signalInfos.data(); + } + + // Use the C-style API directly via Vulkan-HPP to ensure the driver sees our pointers + VkSubmitInfo2 vsi = static_cast(submitInfo); + VkResult res = VULKAN_HPP_DEFAULT_DISPATCHER.vkQueueSubmit2(static_cast(queue), 1, &vsi, static_cast(fence)); + // A failed submission is only logged; no exception is thrown and no status is returned. + if (res != VK_SUCCESS) { + std::cerr << "Error: vkQueueSubmit2 failed! Result: " << vk::to_string(static_cast(res)) << std::endl; + } + } + + /** + * @brief Submit a command buffer to the compute queue using Synchronization 2 and a signal timeline semaphore. + * + * Does nothing for a null command buffer. The signal uses the compute-shader stage mask and + * is submitted through Submit2 on the compute queue, or on the graphics queue when the + * compute queue handle is null. + * + * @param commandBuffer The command buffer to submit. + * @param signalSemaphore The timeline semaphore to signal. 
+ * @param signalValue The value to signal. + */ + void SubmitToComputeQueue2(vk::CommandBuffer commandBuffer, vk::Semaphore signalSemaphore, uint64_t signalValue) { + if (!commandBuffer) return; + + vk::CommandBufferSubmitInfo cmdInfo{.commandBuffer = commandBuffer}; + vk::SemaphoreSubmitInfo signalInfo{ + .semaphore = signalSemaphore, + .value = signalValue, + .stageMask = vk::PipelineStageFlagBits2::eComputeShader + }; + vk::SubmitInfo2 submit2{ + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdInfo, + .signalSemaphoreInfoCount = 1, + .pSignalSemaphoreInfos = &signalInfo + }; + + /* Prefer the compute queue; fall back to the graphics queue when its handle is null. */ + if (*computeQueue) { + Submit2(*computeQueue, submit2, nullptr); + } else { + Submit2(*graphicsQueue, submit2, nullptr); + } + } + + /** + * @brief Submit a command buffer to a queue using Synchronization 2 and an optional signal timeline semaphore. + * + * Does nothing for a null command buffer. When signalUploadsTimeline is true and + * uploadsTimeline is non-null, one signal entry for uploadsTimeline is attached with the + * placeholder value 1 and the all-commands stage mask. Submit2 raises that value when it is + * not greater than what was already signaled or submitted, and writes the final value back + * into the entry; that final value is then copied to the location outSignalValue points at. + * The outSignalValue target is left untouched when no signal entry was attached. + * + * @param queue The queue to submit to. + * @param commandBuffer The command buffer to submit. + * @param signalUploadsTimeline Whether to signal the uploads timeline. + * @param outSignalValue Pointer to receive the signaled value. + * @param fence Optional fence to signal. + */ + void SubmitToQueue2(vk::Queue queue, vk::CommandBuffer commandBuffer, bool signalUploadsTimeline = false, uint64_t* outSignalValue = nullptr, vk::Fence fence = nullptr) { + if (!commandBuffer) return; + + vk::CommandBufferSubmitInfo cmdInfo{.commandBuffer = commandBuffer}; + vk::SemaphoreSubmitInfo signalInfo{}; + uint32_t signalCount = 0; + + if (signalUploadsTimeline && !!*uploadsTimeline) { + // Enforce strictly monotonic signals (will be handled by Submit2) + signalInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *uploadsTimeline, + .value = 1, // Will be overridden by Submit2 + .stageMask = vk::PipelineStageFlagBits2::eAllCommands + }; + signalCount = 1; + } + + vk::SubmitInfo2 submit2{ + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdInfo, + .signalSemaphoreInfoCount = signalCount, + .pSignalSemaphoreInfos = (signalCount > 0) ? 
&signalInfo : nullptr + }; + + Submit2(queue, submit2, fence); + if (outSignalValue && signalCount > 0) { + *outSignalValue = signalInfo.value; + } + } + /** + * @brief Create a shader module from code already held in memory. + * + * Public forwarder to createShaderModule. + * + * @param code The shader code. + * @return The shader module. + */ + vk::raii::ShaderModule CreateShaderModule(const std::vector& code) { + return createShaderModule(code); + } + + /** + * @brief Create a shader module from a file. + * + * Reads the file with readFile and passes the result to createShaderModule. + * + * @param filename The filename. + * @return The shader module. + */ + vk::raii::ShaderModule CreateShaderModule(const std::string& filename) { + auto code = readFile(filename); + return createShaderModule(code); + } + + /** + * @brief Load a texture from a file. + * @param texturePath The path to the texture file. + * @return True if the texture was loaded successfully, false otherwise. + */ + bool LoadTexture(const std::string& texturePath); + + // Asynchronous texture loading APIs (thread-pool backed). + // The 'critical' flag is used to front-load important textures (e.g., + // baseColor/albedo) so the scene looks mostly correct before the loading + // screen disappears. Non-critical textures (normals, MR, AO, emissive) + // can stream in after geometry is visible. + std::future LoadTextureAsync(const std::string& texturePath, bool critical = false); + + /** + * @brief Load a texture from raw image data in memory. + * @param textureId The identifier for the texture. + * @param imageData The raw image data. + * @param width The width of the image. + * @param height The height of the image. + * @param channels The number of channels in the image. + * @return True if the texture was loaded successfully, false otherwise. + */ + bool LoadTextureFromMemory(const std::string& textureId, + const unsigned char* imageData, + int width, + int height, + int channels); + + // Asynchronous upload from memory (RGBA/RGB/other). Safe for concurrent calls. 
+ std::future LoadTextureFromMemoryAsync(const std::string& textureId, + const unsigned char* imageData, + int width, + int height, + int channels, + bool critical = false); + + // Progress query for UI: current values of the textureTasksScheduled / textureTasksCompleted counters. + uint32_t GetTextureTasksScheduled() const { + return textureTasksScheduled.load(); + } + uint32_t GetTextureTasksCompleted() const { + return textureTasksCompleted.load(); + } + + // GPU upload progress (per-texture jobs processed on the main thread). + uint32_t GetUploadJobsTotal() const { + return uploadJobsTotal.load(); + } + uint32_t GetUploadJobsCompleted() const { + return uploadJobsCompleted.load(); + } + + // --- Acceleration structure build progress (for UI) --- + // Exposed so the loading overlay can show meaningful progress when + // BLAS/TLAS builds take a long time (>= ~10 seconds). + // The getters below read the asBuildUi* atomics with relaxed ordering. + bool IsASBuildInProgress() const { + return asBuildUiActive.load(std::memory_order_relaxed); + } + float GetASBuildProgress() const { + return asBuildUiProgress.load(std::memory_order_relaxed); + } + uint32_t GetASBuildItemsDone() const { + return asBuildUiDone.load(std::memory_order_relaxed); + } + uint32_t GetASBuildItemsTotal() const { + return asBuildUiTotal.load(std::memory_order_relaxed); + } + const char* GetASBuildStage() const { + return asBuildUiStage.load(std::memory_order_relaxed); + } + /* Seconds elapsed since asBuildUiStartNs, measured on std::chrono::steady_clock. Returns 0.0 when the start stamp is 0 or is not earlier than now. The stamp is presumably in nanoseconds, given the division by 1e9 - confirm. */ double GetASBuildElapsedSeconds() const { + const uint64_t start = asBuildUiStartNs.load(std::memory_order_relaxed); + if (start == 0) + return 0.0; + const uint64_t now = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + if (now <= start) + return 0.0; + return static_cast(now - start) / 1'000'000'000.0; + } + /* True once an in-progress AS build has been running for at least 10 seconds. */ bool ShouldShowASBuildProgressInUI() const { + return IsASBuildInProgress() && GetASBuildElapsedSeconds() >= 10.0; + } + + // Block until all currently-scheduled texture tasks have completed.
+ // Intended for use during initial scene loading so that descriptor + // creation sees the final textureResources instead of fallbacks. + void WaitForAllTextureTasks(); + + // Process pending texture GPU uploads on the calling thread. + // This should be invoked from the main/render thread so that all + // Vulkan work happens from a single thread while worker threads + // perform only CPU-side decoding. + // + // Parameters allow us to: + // - limit the number of jobs processed per call (for streaming; the default UINT32_MAX means no limit), and + // - choose whether to include critical and/or non-critical jobs (both are included by default). + void ProcessPendingTextureJobs(uint32_t maxJobs = UINT32_MAX, + bool includeCritical = true, + bool includeNonCritical = true); + + // Track which entities use a given texture ID so that descriptor sets + // can be refreshed when textures finish streaming in. + void RegisterTextureUser(const std::string& textureId, Entity* entity); + void OnTextureUploaded(const std::string& textureId); + + /* Work-item counters incremented by AddLoadingWorkItems() / CompleteLoadingWorkItems(); both start at 0. */ std::atomic totalLoadingItems{0}; + std::atomic completedLoadingItems{0}; + + // Global loading state (model/scene). Consider the scene "loading" while + // either the model is being parsed/instantiated OR there are still + // outstanding critical texture uploads (e.g., baseColor/albedo). + // Loading state: show blocking loading overlay only until the initial scene is ready. + // Background streaming may continue after that without blocking the scene.
+ enum class LoadingPhase : uint32_t { + Scene = 0, + Textures, + Physics, + AccelerationStructures, + Finalizing + }; + + enum class InternalLoadingState { + Parsing, // Background thread loading GLTF + Preallocating, // Render thread creating GPU resources + PhysicsInit, // Initializing physics/base textures + Play // Game ready + }; + // Current loading phase (relaxed read of loadingPhase). + LoadingPhase GetLoadingPhase() const { + return static_cast(loadingPhase.load(std::memory_order_relaxed)); + } + // Display name of the current phase; "Loading" for any value outside the enum. + const char* GetLoadingPhaseName() const { + switch (GetLoadingPhase()) { + case LoadingPhase::Scene: + return "Scene"; + case LoadingPhase::Textures: + return "Textures"; + case LoadingPhase::Physics: + return "Physics"; + case LoadingPhase::AccelerationStructures: + return "Acceleration Structures"; + case LoadingPhase::Finalizing: + return "Finalizing"; + default: + return "Loading"; + } + } + // Progress of the current phase, clamped to [0, 1]. + float GetLoadingPhaseProgress() const { + return std::clamp(loadingPhaseProgress.load(std::memory_order_relaxed), 0.0f, 1.0f); + } + // Switch phase; the per-phase progress is reset to 0 only when the phase actually changes. + void SetLoadingPhase(LoadingPhase phase) { + if (loadingPhase.load(std::memory_order_relaxed) != static_cast(phase)) { + loadingPhase.store(static_cast(phase), std::memory_order_relaxed); + loadingPhaseProgress.store(0.0f, std::memory_order_relaxed); + } + } + // Store the per-phase progress, clamped to [0, 1]. + void SetLoadingPhaseProgress(float v) { + loadingPhaseProgress.store(std::clamp(v, 0.0f, 1.0f), std::memory_order_relaxed); + } + + // Smooth monotonic progress tracking across multiple loaders/phases + // NOTE: the reported value is completed/total, so it can drop when new work items are added after progress was reported. + void AddLoadingWorkItems(uint32_t count) { + totalLoadingItems.fetch_add(count, std::memory_order_relaxed); + } + void CompleteLoadingWorkItems(uint32_t count) { + completedLoadingItems.fetch_add(count, std::memory_order_relaxed); + } + // Combined ratio of (completed work items + completed texture tasks) over (total work items + scheduled texture tasks), clamped to [0, 1]; 0 when nothing is registered. + float GetGlobalLoadingProgress() const { + uint32_t total = totalLoadingItems.load(std::memory_order_relaxed); + uint32_t done = completedLoadingItems.load(std::memory_order_relaxed); + uint32_t texSched = textureTasksScheduled.load(std::memory_order_relaxed); + uint32_t texDone =
textureTasksCompleted.load(std::memory_order_relaxed); + + uint32_t finalTotal = total + texSched; + uint32_t finalDone = done + texDone; + + if (finalTotal == 0) return 0.0f; + return std::clamp(static_cast(finalDone) / static_cast(finalTotal), 0.0f, 1.0f); + } + + // Mark the initial load as finished: phase becomes Finalizing with progress 1.0. + void MarkInitialLoadComplete() { + // Finalise phase and progress first, then publish the flag with release ordering, so a thread + // that acquires initialLoadComplete == true also observes the final phase and progress. + SetLoadingPhase(LoadingPhase::Finalizing); + loadingPhaseProgress.store(1.0f, std::memory_order_release); + initialLoadComplete.store(true, std::memory_order_release); + } + void SetInternalLoadingState(InternalLoadingState state) { + currentInternalLoadingState.store(state, std::memory_order_release); + } + InternalLoadingState GetInternalLoadingState() const { + return currentInternalLoadingState.load(std::memory_order_acquire); + } + bool IsLoading() const { + // Stay in loading state until: + // 1. The state machine hits Play + // 2. The geometry preallocation queue is empty + // 3. All pending mesh uploads (vertex/index data) are finished + // This ensures we keep the high preallocation budget (100/frame) and blocking UI until everything is truly ready on the GPU. + return currentInternalLoadingState.load(std::memory_order_relaxed) != InternalLoadingState::Play || + pendingEntityPreallocQueued.load(std::memory_order_relaxed) || + HasPendingMeshUploads(); + } + // True only while the model/scene is still being constructed or while critical + // texture jobs remain outstanding. This excludes the "finalizing" stage where + // the render thread may still be doing post-load work (AS build, descriptor init). + // NOTE(review): the getter only reads loadingFlag as stored by SetLoading(); whether that flag also + // covers outstanding critical texture jobs depends on when callers invoke SetLoading(false) - confirm. + // + // IMPORTANT: Do NOT use critical texture completion as a gate for starting TLAS/BLAS builds. + // AS builds depend on geometry buffers and instance transforms, not on texture readiness. + bool IsSceneLoaderActive() const { + return loadingFlag.load(std::memory_order_relaxed); + } + // Raise or lower the scene-loading flag. Raising it restarts the loading state machine at Parsing / phase Scene; + // lowering it advances Parsing -> Preallocating and leaves any other state untouched. + void SetLoading(bool v) { + std::cout << "DEBUG: SetLoading(" << (v ?
"true" : "false") << ")" << std::endl; + loadingFlag.store(v, std::memory_order_release); + if (v) { + currentInternalLoadingState.store(InternalLoadingState::Parsing, std::memory_order_release); + initialLoadStarted.store(true, std::memory_order_relaxed); + initialLoadComplete.store(false, std::memory_order_release); + SetLoadingPhase(LoadingPhase::Scene); + loadingPhaseProgress.store(0.0f, std::memory_order_relaxed); + } else { + // Switch from Parsing to Preallocating in one atomic step: a separate load followed by a store + // could overwrite a state that another thread installed in between. + InternalLoadingState expected = InternalLoadingState::Parsing; + currentInternalLoadingState.compare_exchange_strong(expected, InternalLoadingState::Preallocating, + std::memory_order_acq_rel, std::memory_order_acquire); + } + } + + // Descriptor set deferred update machinery + void MarkEntityDescriptorsDirty(Entity *entity); + void ProcessDirtyDescriptorsForFrame(uint32_t frameIndex); + + // Texture aliasing: map canonical IDs to actual loaded keys (e.g., file paths) to avoid duplicates + // Registers aliasId -> (resolved) targetId. An alias that would point at itself is removed instead of stored. + inline void RegisterTextureAlias(const std::string& aliasId, const std::string& targetId) { + // Reject empty ids before taking the exclusive lock; the check touches no shared state. + if (aliasId.empty() || targetId.empty()) + return; + std::unique_lock lock(textureResourcesMutex); + // Resolve targetId without re-locking by walking the alias map directly (at most 8 hops) + std::string resolved = targetId; + for (int i = 0; i < 8; ++i) { + auto it = textureAliases.find(resolved); + if (it == textureAliases.end()) + break; + if (it->second == resolved) + break; + resolved = it->second; + } + if (aliasId == resolved) { + textureAliases.erase(aliasId); + } else { + textureAliases[aliasId] = resolved; + } + } + // Follow the alias chain starting at id under a shared lock and return the last id reached (at most 8 hops). + inline std::string ResolveTextureId(const std::string& id) const { + std::shared_lock lock(textureResourcesMutex); + std::string cur = id; + for (int i = 0; i < 8; ++i) { + // prevent pathological cycles + auto it = textureAliases.find(cur); + if (it == textureAliases.end()) + break; + if (it->second == cur) + break; // self-alias guard + cur = it->second; + } + return cur; + } + + /** + * @brief Transition an image layout. + * @param image The image.
+ * @param format The image format. + * @param oldLayout The old layout. + * @param newLayout The new layout. + */ + void TransitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout) { + transitionImageLayout(image, format, oldLayout, newLayout); + } + + /** + * @brief Copy a buffer to an image. + * Builds a single copy region (buffer offset 0, color aspect, mip level 0, array layer 0, + * extent width x height x 1) and forwards it to copyBufferToImage(). + * @param buffer The buffer. + * @param image The image. + * @param width The image width. + * @param height The image height. + */ + void CopyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height) { + // Create a default single region for backward compatibility + std::vector regions = { + { + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1 + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {width, height, 1} + } + }; + copyBufferToImage(buffer, image, width, height, regions); + } + + /** + * @brief Get the current command buffer. + * @return The command buffer stored at index currentFrame. + */ + vk::raii::CommandBuffer& GetCurrentCommandBuffer() { + return commandBuffers[currentFrame]; + } + + /** + * @brief Get the swap chain image format. + * @return The swap chain image format. + */ + vk::Format GetSwapChainImageFormat() const { + return swapChainImageFormat; + } + + /** + * @brief Set the framebuffer resized flag. + * This should be called when the window is resized to trigger swap chain recreation. + */ + void SetFramebufferResized() { + framebufferResized.store(true, std::memory_order_relaxed); + } + + /** + * @brief Set the model loader reference for accessing extracted lights. + * Also invalidates the cached material information of every entry in entityResources. + * @param _modelLoader Pointer to the model loader. + */ + void SetModelLoader(ModelLoader* _modelLoader) { + modelLoader = _modelLoader; + // Materials are resolved via ModelLoader; invalidate cached per-entity material info.
+ for (auto& kv : entityResources) { + kv.second.materialCacheValid = false; + kv.second.cachedMaterial = nullptr; + kv.second.cachedIsBlended = false; + kv.second.cachedIsGlass = false; + kv.second.cachedIsLiquid = false; + kv.second.cachedMaterialProps = MaterialProperties{}; + } + } + + /** + * @brief Set static lights loaded during model initialization. + * Copies the given vector into staticLights and logs the entry count to std::cout. + * @param lights The lights to store statically. + */ + void SetStaticLights(const std::vector& lights) { + staticLights = lights; + std::cout << "[Lights] staticLights set: " << staticLights.size() << " entries" << std::endl; + } + + /** + * @brief Set the gamma correction value for PBR rendering. + * @param _gamma The gamma correction value (typically 2.2). + */ + void SetGamma(float _gamma) { + gamma = _gamma; + } + + /** + * @brief Set the exposure value for HDR tone mapping. + * @param _exposure The exposure value (1.0 = no adjustment). + */ + void SetExposure(float _exposure) { + exposure = _exposure; + } + + // Reflection intensity (UI + shader control) + void SetReflectionIntensity(float v) { + reflectionIntensity = v; + } + float GetReflectionIntensity() const { + return reflectionIntensity; + } + + void SetPlanarReflectionsEnabled(bool enabled); + void TogglePlanarReflections(); + bool IsPlanarReflectionsEnabled() const { + return enablePlanarReflections; + } + + // Ray query rendering mode control + // ToggleRenderMode switches Rasterization -> RayQuery; any other current mode switches to Rasterization. + void SetRenderMode(RenderMode mode) { + currentRenderMode = mode; + } + RenderMode GetRenderMode() const { + return currentRenderMode; + } + void ToggleRenderMode() { + currentRenderMode = (currentRenderMode == RenderMode::Rasterization) ?
RenderMode::RayQuery : RenderMode::Rasterization; + } + + // Ray query capability getters + bool GetRayQueryEnabled() const { + return rayQueryEnabled; + } + bool GetAccelerationStructureEnabled() const { + return accelerationStructureEnabled; + } + + // Ray Query static-only mode (disable animation/physics updates and TLAS refits to render a static opaque scene) + void SetRayQueryStaticOnly(bool v) { + rayQueryStaticOnly = v; + } + bool IsRayQueryStaticOnly() const { + return rayQueryStaticOnly; + } + + /** + * @brief Request acceleration structure build at next safe frame point. + * Does nothing unless both acceleration structures and ray queries are enabled. + * Safe to call from any thread (e.g., background loading thread). + */ + void RequestAccelerationStructureBuild() { + if (!accelerationStructureEnabled || !rayQueryEnabled) + return; + // Record when the request was made so the render loop can enforce a bounded deferral + // policy (avoid getting stuck waiting for “perfect” readiness forever). + // NOTE: `asBuildRequested` may already be true due to other triggers; still ensure + // the request timestamp is armed so the timeout logic can work. + if (asBuildRequestStartNs.load(std::memory_order_relaxed) == 0) { + const uint64_t nowNs = static_cast<uint64_t>( + std::chrono::duration_cast<std::chrono::nanoseconds>( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + // Arm with a compare-exchange: when several threads request a build at the same time the + // first timestamp wins and is never overwritten by a later one. + uint64_t unarmed = 0; + asBuildRequestStartNs.compare_exchange_strong(unarmed, nowNs, std::memory_order_relaxed); + } + // Allow AS build to take longer than the watchdog threshold (large scenes in Debug).
+ watchdogSuppressed.store(true, std::memory_order_relaxed); + asBuildRequested.store(true, std::memory_order_release); + } + void KickWatchdog(); + // Overload with reason tracking for diagnostics + // Records (and, when non-null, logs) the reason, lifts the rebuild freeze, then performs the same request as the no-argument overload. + // NOTE(review): lastASBuildRequestReason and asDevOverrideAllowRebuild are assigned with plain '=' here; their declarations are + // not visible in this part of the file, so whether this overload is safe to call concurrently needs confirming. + void RequestAccelerationStructureBuild(const char* reason) { + if (!accelerationStructureEnabled || !rayQueryEnabled) + return; + if (reason) { + lastASBuildRequestReason = reason; + std::cout << "[AS] Requesting rebuild. Reason: " << reason << std::endl; + } else { + lastASBuildRequestReason = "(no reason)"; + } + + // Explicit requests bypass the freeze to ensure dynamic objects (like balls) are added + asDevOverrideAllowRebuild = true; + + // Timestamp arming, watchdog suppression and the release-store of asBuildRequested are shared with the + // no-argument overload; the release-store still comes last, so it publishes the writes above. + RequestAccelerationStructureBuild(); + } + + /** + * @brief Build acceleration structures for ray query rendering. + * @param entities The entities to include in the acceleration structures. + * @return True if successful, false otherwise. + */ + bool buildAccelerationStructures(const std::vector& entities); + + // Refit/UPDATE the TLAS with latest entity transforms (no rebuild) + bool refitTopLevelAS(const std::vector& entities, CameraComponent* camera); + + /** + * @brief Update ray query descriptor sets with current resources. + * @param frameIndex The frame index to update (required; the parameter has no default value). + * @param entities The scene entities (presumably the source of the resources being bound - confirm against the implementation). + * @return True if successful, false otherwise. + */ + bool updateRayQueryDescriptorSets(uint32_t frameIndex, const std::vector& entities); + + /** + * @brief Create or resize light storage buffers to accommodate the given number of lights. + * @param lightCount The number of lights to accommodate. + * @return True if successful, false otherwise.
+ */ + bool createOrResizeLightStorageBuffers(size_t lightCount); + + /** + * @brief Update the light storage buffer with current light data. + * @param frameIndex The current frame index. + * @param lights The light data to upload. + * @param camera Optional camera; defaults to nullptr. + * @return True if successful, false otherwise. + */ + bool updateLightStorageBuffer(uint32_t frameIndex, const std::vector& lights, CameraComponent* camera = nullptr); + + /** + * @brief Update all existing descriptor sets with new light storage buffer references. + * Called when light storage buffers are recreated to ensure descriptor sets reference valid buffers. + * @param allFrames Refresh every frame instead of only the current one; defaults to false. + */ + // Update PBR descriptor sets to point to the latest light SSBOs. + // When allFrames=true, refresh all frames (use only when the device is idle — e.g., after waitIdle()). + // Otherwise, refresh only the current frame at the frame safe point to avoid touching in‑flight frames. + void updateAllDescriptorSetsWithNewLightBuffers(bool allFrames = false); + + // Upload helper: record both layout transitions and the copy in a single submit with a fence + void uploadImageFromStaging(vk::Buffer staging, + vk::Image image, + vk::Format format, + const std::vector& regions, + uint32_t mipLevels = 1); + + // Generate full mip chain for a 2D color image using GPU blits + void generateMipmaps(vk::Image image, + vk::Format format, + int32_t texWidth, + int32_t texHeight, + uint32_t mipLevels); + + vk::Format findDepthFormat(); + + /** + * @brief Pre-allocate all Vulkan resources for an entity during scene loading. + * @param entity The entity to pre-allocate resources for. + * @return True if pre-allocation was successful, false otherwise. + */ + bool preAllocateEntityResources(Entity* entity); + + /** + * @brief Pre-allocate Vulkan resources for a batch of entities, batching mesh uploads. + * + * This variant is optimized for large scene loads (e.g., GLTF Bistro).
It will: + * - Create per-mesh GPU buffers as usual, but record all buffer copy commands + * into a single command buffer and submit them in one batch. + * - Then create uniform buffers and descriptor sets per entity. + * + * Callers that load many geometry entities at once (like GLTF scene loading) + * should prefer this over repeated preAllocateEntityResources() calls. + * + * @param entities The entities to pre-allocate resources for. + */ + bool preAllocateEntityResourcesBatch(const std::vector& entities); + + // Thread-safe: enqueue entities that need GPU-side resource preallocation. + // The actual Vulkan work will be performed on the render thread at the frame-start safe point. + void EnqueueEntityPreallocationBatch(const std::vector& entities); + void EnqueueInstanceBufferRecreation(Entity* entity); + bool HasPendingPreallocations() const { + return pendingEntityPreallocQueued.load(std::memory_order_relaxed); + } + void ProcessPendingEntityPreallocations(); + + /** + * @brief Recreate the instance buffer for an entity that had its instances cleared. + * + * When an entity that was originally set up for instanced rendering needs to be + * converted to a single non-instanced entity (e.g., for animation), this method + * recreates the GPU instance buffer with a single identity instance. + * + * @param entity The entity whose instance buffer should be recreated. + * @return True if successful, false otherwise. + */ + bool recreateInstanceBuffer(Entity* entity); + + // Shared default PBR texture identifiers (to avoid creating hundreds of identical textures) + static const std::string SHARED_DEFAULT_ALBEDO_ID; + static const std::string SHARED_DEFAULT_NORMAL_ID; + static const std::string SHARED_DEFAULT_METALLIC_ROUGHNESS_ID; + static const std::string SHARED_DEFAULT_OCCLUSION_ID; + static const std::string SHARED_DEFAULT_EMISSIVE_ID; + static const std::string SHARED_BRIGHT_RED_ID; + + /** + * @brief Determine the appropriate texture format based on the texture type. + * @param textureId The texture identifier to analyze.
+ * @return The appropriate Vulkan format (sRGB for baseColor, linear for others). + */ + static vk::Format determineTextureFormat(const std::string& textureId); + + public: + // Milestone values for frameTimeline + struct TimelineMilestones { + static constexpr uint64_t eFrameStart = 0; + static constexpr uint64_t ePhysicsFinished = 1; + static constexpr uint64_t eCullingFinished = 2; + static constexpr uint64_t eGpuWorkFinished = 3; + }; + + private: + // Platform + Platform* platform = nullptr; + + // Model loader reference for accessing extracted lights + class ModelLoader* modelLoader = nullptr; + + // PBR rendering parameters + float gamma = 2.2f; // Gamma correction value + float exposure = 1.2f; // HDR exposure value (default tuned to avoid washout) + float reflectionIntensity = 1.0f; // User control for glass reflection strength + // Raster shadows (experimental): use ray queries in the raster PBR fragment shader. + // Wired through `UniformBufferObject.padding2` to avoid UBO layout churn. 
+ bool enableRasterRayQueryShadows = false; + + // Ray Query tuning + int rayQueryMaxBounces = 1; // 0 = no secondary rays, 1 = one-bounce reflection/refraction + bool enableRayQueryShadows = true; // Hard shadows for Ray Query direct lighting (shadow rays) + int rayQueryShadowSampleCount = 1; // 1 = hard; >1 enables soft-shadow sampling in the shader + float rayQueryShadowSoftness = 0.0f; // 0 = hard; otherwise scales effective light radius (fraction of range) + // Thick-glass controls (RQ-only) + bool enableThickGlass = true; + float thickGlassAbsorptionScale = 1.0f; + float thickGlassThicknessClamp = 0.2f; // meters + + // Vulkan RAII context + vk::raii::Context context; + + // Vulkan instance and debug messenger + vk::raii::Instance instance = nullptr; + vk::raii::DebugUtilsMessengerEXT debugMessenger = nullptr; + + // Vulkan device + vk::raii::PhysicalDevice physicalDevice = nullptr; + vk::raii::Device device = nullptr; + + // Timeline Synchronization + vk::raii::Semaphore frameTimeline{nullptr}; + std::atomic totalFrameCount{0}; + uint64_t currentTimelineValue{0}; // The value being signaled by the current frame's GPU work + std::atomic lastCompletedFrame{0}; + + // Monotonic trackers for timeline signals + std::atomic nextFrameTimelineValue{0}; + std::atomic nextUploadTimelineValue{0}; + + + // Memory pool for efficient memory management + std::unique_ptr memoryPool; + + // Acceleration structure properties + vk::PhysicalDeviceAccelerationStructurePropertiesKHR accelStructProperties; + + // Vulkan queues + vk::raii::Queue graphicsQueue = nullptr; + vk::raii::Queue presentQueue = nullptr; + vk::raii::Queue computeQueue = nullptr; + + // Vulkan surface + vk::raii::SurfaceKHR surface = nullptr; + + // Swap chain + vk::raii::SwapchainKHR swapChain = nullptr; + std::vector swapChainImages; + vk::Format swapChainImageFormat = vk::Format::eUndefined; + vk::Extent2D swapChainExtent = {0, 0}; + std::vector swapChainImageViews; + // Tracked layouts for swapchain 
images (VVL requires correct oldLayout in barriers). + // Initialized at swapchain creation and updated as we transition. + std::vector swapChainImageLayouts; + + // Dynamic rendering info + vk::RenderingInfo renderingInfo; + std::vector colorAttachments; + vk::RenderingAttachmentInfo depthAttachment; + + // Pipelines + vk::raii::PipelineLayout pipelineLayout = nullptr; + vk::raii::Pipeline graphicsPipeline = nullptr; + vk::raii::PipelineLayout pbrPipelineLayout = nullptr; + vk::raii::Pipeline pbrGraphicsPipeline = nullptr; + vk::raii::Pipeline pbrBlendGraphicsPipeline = nullptr; + // Transparent PBR pipeline variant for premultiplied alpha content + vk::raii::Pipeline pbrPremulBlendGraphicsPipeline = nullptr; + // Opaque PBR pipeline variant used after a depth pre-pass (depth read-only, compare with pre-pass depth) + vk::raii::Pipeline pbrPrepassGraphicsPipeline = nullptr; + // Reflection PBR pipeline used for mirrored off-screen pass (cull none to avoid winding issues) + vk::raii::Pipeline pbrReflectionGraphicsPipeline = nullptr; + // Specialized pipeline for architectural glass (windows, lamp glass, etc.). + // Shares descriptor layouts and vertex input with the PBR pipelines but uses + // a dedicated fragment shader entry point for more stable glass shading. + vk::raii::Pipeline glassGraphicsPipeline = nullptr; + vk::raii::PipelineLayout lightingPipelineLayout = nullptr; + vk::raii::Pipeline lightingPipeline = nullptr; + + // Fullscreen composite pipeline to draw the opaque off-screen color to the swapchain + // (used to avoid gamma-incorrect vkCmdCopyImage and to apply tone mapping when desired). 
+ vk::raii::PipelineLayout compositePipelineLayout = nullptr; + vk::raii::Pipeline compositePipeline = nullptr; + vk::raii::DescriptorSetLayout compositeDescriptorSetLayout = nullptr; // not used; reuse transparentDescriptorSetLayout + std::vector compositeDescriptorSets; // unused; reuse transparentDescriptorSets + + // Pipeline rendering create info structures (for proper lifetime management) + vk::PipelineRenderingCreateInfo mainPipelineRenderingCreateInfo; + vk::PipelineRenderingCreateInfo pbrPipelineRenderingCreateInfo; + vk::PipelineRenderingCreateInfo lightingPipelineRenderingCreateInfo; + vk::PipelineRenderingCreateInfo compositePipelineRenderingCreateInfo; + vk::PipelineRenderingCreateInfo rayQueryPipelineRenderingCreateInfo; + + // Create composite pipeline + bool createCompositePipeline(); + + // Compute pipeline + vk::raii::PipelineLayout computePipelineLayout = nullptr; + vk::raii::Pipeline computePipeline = nullptr; + vk::raii::DescriptorSetLayout computeDescriptorSetLayout = nullptr; + vk::raii::DescriptorPool computeDescriptorPool = nullptr; + std::vector computeDescriptorSets; + vk::raii::CommandPool computeCommandPool = nullptr; + + // Thread safety for queue access - unified mutex since queues may share the same underlying VkQueue + mutable std::mutex queueMutex; + // Thread safety for descriptor pool/set operations across all engine threads + mutable std::mutex descriptorMutex; + // Monotonic generation counter for descriptor pool rebuilds (future use for hardening) + std::atomic descriptorPoolGeneration{0}; + + // Command pool and buffers + vk::raii::CommandPool commandPool = nullptr; + std::vector commandBuffers; + // Protect usage of shared commandPool for transient command buffers + mutable std::mutex commandMutex; + + // Dedicated transfer queue (falls back to graphics if unavailable) + vk::raii::Queue transferQueue = nullptr; + + // Synchronization objects + std::vector imageAvailableSemaphores; + std::vector renderFinishedSemaphores; + 
std::vector inFlightFences; + + // Upload timeline semaphore for transfer -> graphics handoff (signaled per upload) + vk::raii::Semaphore uploadsTimeline = nullptr; + // Tracks last timeline value that has been submitted for signaling on uploadsTimeline (obsolete, using nextUploadTimelineValue) + // std::atomic uploadTimelineLastSubmitted{0}; + + // Depth buffer + vk::raii::Image depthImage = nullptr; + std::unique_ptr depthImageAllocation = nullptr; + vk::raii::ImageView depthImageView = nullptr; + + // Forward+ configuration + bool useForwardPlus = true; // default enabled + uint32_t forwardPlusTileSizeX = 16; + uint32_t forwardPlusTileSizeY = 16; + uint32_t forwardPlusSlicesZ = 16; // clustered depth slices + static constexpr uint32_t MAX_LIGHTS_PER_TILE = 256; // conservative cap + + struct TileHeader { + uint32_t offset; // into tileLightIndices + uint32_t count; // number of indices for this tile + uint32_t pad0; + uint32_t pad1; + }; + + struct ForwardPlusPerFrame { + // SSBOs for per-tile light lists + vk::raii::Buffer tileHeaders = nullptr; + std::unique_ptr tileHeadersAlloc = nullptr; + vk::raii::Buffer tileLightIndices = nullptr; + std::unique_ptr tileLightIndicesAlloc = nullptr; + size_t tilesCapacity = 0; // number of tiles allocated + size_t indicesCapacity = 0; // number of indices allocated + + // Uniform buffer with view/proj, screen size, tile size, etc. 
+ vk::raii::Buffer params = nullptr; + std::unique_ptr paramsAlloc = nullptr; + void* paramsMapped = nullptr; + + // Optional compute debug output buffer (uints), host-visible + vk::raii::Buffer debugOut = nullptr; + std::unique_ptr debugOutAlloc = nullptr; + bool debugOutAwaitingReadback = false; + + // One-frame color probes (host-visible, small buffers) + vk::raii::Buffer probeOffscreen = nullptr; + std::unique_ptr probeOffscreenAlloc = nullptr; + vk::raii::Buffer probeSwapchain = nullptr; + std::unique_ptr probeSwapchainAlloc = nullptr; + bool probeAwaitingReadback = false; + + // Compute descriptor set for culling + vk::raii::DescriptorSet computeSet = nullptr; + }; + std::vector forwardPlusPerFrame; // size MAX_FRAMES_IN_FLIGHT + // Per-frame light count used by shaders (set once before main pass) + uint32_t lastFrameLightCount = 0; + + // Forward+ compute resources + vk::raii::PipelineLayout forwardPlusPipelineLayout = nullptr; + vk::raii::Pipeline forwardPlusPipeline = nullptr; + vk::raii::DescriptorSetLayout forwardPlusDescriptorSetLayout = nullptr; + + // Depth pre-pass pipeline + vk::raii::Pipeline depthPrepassPipeline = nullptr; + + // Ray query rendering mode + RenderMode currentRenderMode = RenderMode::RayQuery; + + // Ray query pipeline and resources + vk::raii::PipelineLayout rayQueryPipelineLayout = nullptr; + vk::raii::Pipeline rayQueryPipeline = nullptr; + vk::raii::DescriptorSetLayout rayQueryDescriptorSetLayout = nullptr; + std::vector rayQueryDescriptorSets; + // Track when the ray query descriptor set for each frame has been written. + // Updating binding 6 (large texture table) can be expensive; avoid doing it every frame. + std::vector rayQueryDescriptorsWritten; // size = MAX_FRAMES_IN_FLIGHT + // Bitmask of frames whose ray query descriptor set needs a refresh (e.g., after TLAS rebuild or texture upload). 
+ std::atomic rayQueryDescriptorsDirtyMask{0}; + + // Dedicated ray query UBO (one per frame in flight) - separate from entity UBOs + std::vector rayQueryUniformBuffers; + std::vector> rayQueryUniformAllocations; + std::vector rayQueryUniformBuffersMapped; + + // Ray query output image (storage image for compute shader output) + vk::raii::Image rayQueryOutputImage = nullptr; + std::unique_ptr rayQueryOutputImageAllocation = nullptr; + vk::raii::ImageView rayQueryOutputImageView = nullptr; + + // Acceleration structures for ray query + struct AccelerationStructure { + vk::raii::Buffer buffer = nullptr; + std::unique_ptr allocation = nullptr; + vk::raii::AccelerationStructureKHR handle = nullptr; // Use RAII for proper lifetime management + vk::DeviceAddress deviceAddress = 0; + }; + std::vector blasStructures; // Bottom-level AS (one per mesh) + AccelerationStructure tlasStructure; // Top-level AS (scene) + + // Deferred deletion queue for old AS structures + // Keeps old AS buffers alive until all frames in flight have finished using them + struct PendingASDelete { + std::vector blasStructures; + AccelerationStructure tlasStructure; + uint64_t timelineValue = 0; // The timeline value when this AS can be safely deleted + }; + std::vector pendingASDeletions; + + // GPU data structures for ray query proper normal and material access + struct GeometryInfo { + uint64_t vertexBufferAddress; // Device address of vertex buffer + uint64_t indexBufferAddress; // Device address of index buffer + uint32_t vertexCount; // Number of vertices + uint32_t materialIndex; // Index into material buffer + uint32_t indexCount; // Number of indices (to bound primitiveIndex in shader) + uint32_t _pad0; + // Instance-space -> world-space normal transform (3 columns). Matches raster convention. + // Stored as float4 columns (xyz used, w unused) for stable std430 layout. 
+ alignas(16) glm::vec4 normalMatrix0; + alignas(16) glm::vec4 normalMatrix1; + alignas(16) glm::vec4 normalMatrix2; + }; + + struct MaterialData { + alignas(16) glm::vec3 albedo; + alignas(4) float metallic; + alignas(16) glm::vec3 emissive; + alignas(4) float roughness; + alignas(4) float ao; + alignas(4) float ior; + alignas(4) float emissiveStrength; + alignas(4) float alpha; + alignas(4) float transmissionFactor; + alignas(4) float alphaCutoff; + // glTF alpha mode encoding (matches shader): 0=OPAQUE, 1=MASK, 2=BLEND + alignas(4) int32_t alphaMode; + alignas(4) uint32_t isGlass; // bool as uint32 + alignas(4) uint32_t isLiquid; // bool as uint32 + + // Thick-glass parameters (RQ-only) + alignas(16) glm::vec3 absorptionColor{1.0f, 1.0f, 1.0f}; + alignas(4) float absorptionDistance = 1.0f; // meters + alignas(4) uint32_t thinWalled = 1u; // 1 = thin surface, 0 = thick volume + + // Raster parity: texture-set flags (-1 = no texture; 0 = sample from binding 6 table). + // Ray Query uses a single texture table (binding 6); indices are always valid even when + // the set flag is -1, so the shader can choose the correct no-texture behavior. + alignas(4) int32_t baseColorTextureSet; + alignas(4) int32_t physicalDescriptorTextureSet; + alignas(4) int32_t normalTextureSet; + alignas(4) int32_t occlusionTextureSet; + alignas(4) int32_t emissiveTextureSet; + + // Ray Query texture table indices (binding 6). These always reference a valid descriptor + // (real streamed texture or a shared default slot). 
+ alignas(4) int32_t baseColorTexIndex; + alignas(4) int32_t normalTexIndex; + alignas(4) int32_t physicalTexIndex; // metallic-roughness (default) or spec-gloss when useSpecGlossWorkflow=1 + alignas(4) int32_t occlusionTexIndex; + alignas(4) int32_t emissiveTexIndex; + + // Specular-glossiness workflow support (KHR_materials_pbrSpecularGlossiness) + alignas(4) int32_t useSpecGlossWorkflow; // 1 if SpecGloss + alignas(4) float glossinessFactor; + alignas(16) glm::vec3 specularFactor; + alignas(4) int32_t hasEmissiveStrengthExt; + alignas(4) uint32_t _padMat[3]; + }; + + // Ray query geometry and material buffers + vk::raii::Buffer geometryInfoBuffer = nullptr; + std::unique_ptr geometryInfoAllocation = nullptr; + vk::raii::Buffer materialBuffer = nullptr; + std::unique_ptr materialAllocation = nullptr; + + // Ray query baseColor texture array (binding 6) + static constexpr uint32_t RQ_MAX_TEX = 2048; + // Reserved slots in the Ray Query texture table (binding 6) + static constexpr uint32_t RQ_SLOT_DEFAULT_BASECOLOR = 0; + static constexpr uint32_t RQ_SLOT_DEFAULT_NORMAL = 1; + static constexpr uint32_t RQ_SLOT_DEFAULT_METALROUGH = 2; + static constexpr uint32_t RQ_SLOT_DEFAULT_OCCLUSION = 3; + static constexpr uint32_t RQ_SLOT_DEFAULT_EMISSIVE = 4; + // NOTE: Textures can stream in asynchronously and their underlying VkImageView/VkSampler + // can be destroyed/recreated. Therefore, the Ray Query texture table must NOT cache + // VkDescriptorImageInfo (which contains raw handles). Instead, cache only the canonical + // texture key per slot and rebuild VkDescriptorImageInfo each descriptor update. + // + // Slots 0..4 are reserved for shared default PBR textures. 
+ std::vector rayQueryTexKeys; // slot -> canonical texture key + std::vector rayQueryTexFallbackSlots; // slot -> fallback slot (type-appropriate default) + uint32_t rayQueryTexCount = 0; // number of valid slots in rayQueryTexKeys + std::unordered_map rayQueryTexIndex; // canonicalKey -> slot + + // Per-material texture path mapping captured at AS build time; used for streaming requests + // and debugging, but Ray Query primarily uses per-material texture indices. + struct RQMaterialTexPaths { + std::string baseColor; + std::string normal; + std::string physical; + std::string occlusion; + std::string emissive; + }; + std::vector rqMaterialTexPaths; + + // Count of GeometryInfo instances currently uploaded (CPU-side tracking) + size_t geometryInfoCountCPU = 0; + // Count of materials currently uploaded (CPU-side tracking) + size_t materialCountCPU = 0; + + // --- Pending GPU uploads (to be executed on the render thread safe point) --- + mutable std::mutex pendingMeshUploadsMutex; + std::vector pendingMeshUploads; // meshes with staged data to copy + + struct InFlightMeshUploadBatch { + uint64_t signalValue = 0; + std::vector meshes; + std::unique_ptr commandPool; + std::unique_ptr commandBuffers; + }; + mutable std::mutex inFlightMeshUploadsMutex; + std::deque inFlightMeshUploads; + + // Enqueue mesh uploads collected on background/loading threads + void EnqueueMeshUploads(const std::vector& meshes); + bool HasPendingMeshUploads() const { + std::lock_guard lk1(pendingMeshUploadsMutex); + if (!pendingMeshUploads.empty()) return true; + std::lock_guard lk2(inFlightMeshUploadsMutex); + return !inFlightMeshUploads.empty(); + } + // Execute pending mesh uploads on the render thread (called from Render after fence wait) + void ProcessPendingMeshUploads(); + + // --- Pending entity GPU preallocation (enqueued by scene loader thread; executed on render thread) --- + std::mutex pendingEntityPreallocMutex; + std::vector pendingEntityPrealloc; + std::vector 
pendingInstanceBufferRecreations; + std::atomic pendingEntityPreallocQueued{false}; + + // Descriptor set layouts (declared before pools and sets) + vk::raii::DescriptorSetLayout descriptorSetLayout = nullptr; + vk::raii::DescriptorSetLayout pbrDescriptorSetLayout = nullptr; + vk::raii::DescriptorSetLayout transparentDescriptorSetLayout = nullptr; + vk::raii::PipelineLayout pbrTransparentPipelineLayout = nullptr; + + // The texture that will hold a snapshot of the opaque scene + // One off-screen color image per frame-in-flight to avoid cross-frame read/write hazards. + std::vector opaqueSceneColorImages; + std::vector> opaqueSceneColorImageAllocations; + std::vector opaqueSceneColorImageViews; + // Track the current layout per frame (initialized to eUndefined at creation) + std::vector opaqueSceneColorImageLayouts; + vk::raii::Sampler opaqueSceneColorSampler{nullptr}; + + // A descriptor set for the opaque scene color texture. One per frame in flight. + std::vector transparentDescriptorSets; + // Fallback descriptor sets for opaque pass (binds a default SHADER_READ_ONLY texture as Set 1) + std::vector transparentFallbackDescriptorSets; + + // Ray Query composite descriptor sets: sample the rayQueryOutputImage in a fullscreen pass + std::vector rqCompositeDescriptorSets; + // Fallback sampler for the RQ composite if no other sampler is available at init time + vk::raii::Sampler rqCompositeSampler{nullptr}; + + // Mesh resources + struct MeshResources { + // Device-local vertex/index buffers used for rendering + vk::raii::Buffer vertexBuffer = nullptr; + std::unique_ptr vertexBufferAllocation = nullptr; + vk::raii::Buffer indexBuffer = nullptr; + std::unique_ptr indexBufferAllocation = nullptr; + uint32_t indexCount = 0; + + // Optional per-mesh staging buffers used when uploads are batched. + // These are populated when createMeshResources(..., deferUpload=true) is used + // and are consumed and cleared by preAllocateEntityResourcesBatch(). 
+ vk::raii::Buffer stagingVertexBuffer = nullptr; + std::unique_ptr stagingVertexBufferAllocation = nullptr; + vk::DeviceSize vertexBufferSizeBytes = 0; + + vk::raii::Buffer stagingIndexBuffer = nullptr; + std::unique_ptr stagingIndexBufferAllocation = nullptr; + vk::DeviceSize indexBufferSizeBytes = 0; + + // Material index for ray query (extracted from entity name or MaterialMesh) + int32_t materialIndex = -1; // -1 = no material/default + }; + std::unordered_map meshResources; + + // Texture resources + struct TextureResources { + vk::raii::Image textureImage = nullptr; + std::unique_ptr textureImageAllocation = nullptr; + vk::raii::ImageView textureImageView = nullptr; + vk::raii::Sampler textureSampler = nullptr; + vk::Format format = vk::Format::eR8G8B8A8Srgb; // Store texture format for proper color space handling + uint32_t mipLevels = 1; // Store number of mipmap levels + // Hint: true if source texture appears to use alpha masking (any alpha < ~1.0) + bool alphaMaskedHint = false; + }; + std::unordered_map textureResources; + + // Pending texture jobs that require GPU-side work. Worker threads + // enqueue these jobs; the main thread drains them and performs the + // actual LoadTexture/LoadTextureFromMemory calls. + struct PendingTextureJob { + enum class Type { + FromFile, + FromMemory + } type; + enum class Priority { + Critical, + NonCritical + } priority; + std::string idOrPath; + std::vector data; // only used for FromMemory + int width = 0; + int height = 0; + int channels = 0; + }; + + std::mutex pendingTextureJobsMutex; + std::condition_variable pendingTextureCv; + std::vector pendingTextureJobs; + // Track outstanding critical texture jobs (for IsLoading) + std::atomic criticalJobsOutstanding{0}; + + // Background uploader worker controls (multiple workers) + std::atomic stopUploadsWorker{false}; + std::vector uploadsWorkerThreads; + + // Track how many texture upload jobs have been scheduled vs completed + // on the GPU side. 
Used only for UI feedback during streaming. + std::atomic uploadJobsTotal{0}; + std::atomic uploadJobsCompleted{0}; + // When true, initial scene load has started + std::atomic initialLoadStarted{false}; + // When true, initial scene load is complete and the loading overlay should be hidden + std::atomic initialLoadComplete{false}; + std::atomic currentInternalLoadingState{InternalLoadingState::Parsing}; + // Loading-phase UI state (atomic because ImGui may query at any point) + std::atomic loadingPhase{static_cast(LoadingPhase::Scene)}; + std::atomic loadingPhaseProgress{0.0f}; + + // Performance counters for texture uploads + std::atomic bytesUploadedTotal{0}; + // Streaming window start time in nanoseconds from steady_clock epoch (0 when inactive) + std::atomic uploadWindowStartNs{0}; + // Aggregate per-texture CPU upload durations (nanoseconds) and count + std::atomic totalUploadNs{0}; + std::atomic uploadCount{0}; + + // Reverse mapping from texture ID to entities that reference it. Used to + // update descriptor sets when a streamed texture finishes uploading. + std::mutex textureUsersMutex; + std::unordered_map> textureToEntities; + + // Entities needing descriptor set refresh due to streamed textures + std::mutex dirtyEntitiesMutex; + // Map of entity -> bitmask of frames-in-flight that still need a descriptor refresh. + // This avoids the “frame 0 updated / frame 1 still default” oscillation when + // MAX_FRAMES_IN_FLIGHT > 1 and a texture becomes available mid-stream. 
+ std::unordered_map descriptorDirtyEntities; + + // Protect concurrent access to textureResources + mutable std::shared_mutex textureResourcesMutex; + + // Protect concurrent access to entityResources + mutable std::shared_mutex entityResourcesMutex; + + // Protect concurrent access to meshResources + mutable std::shared_mutex meshResourcesMutex; + + // Texture aliasing: maps alias (canonical) IDs to actual loaded keys + std::unordered_map textureAliases; + + // Per-texture load de-duplication (serialize loads of the same texture ID only) + mutable std::mutex textureLoadStateMutex; + std::condition_variable textureLoadStateCv; + std::unordered_set texturesLoading; + + // Serialize GPU-side texture upload (image/buffer creation, transitions) to avoid driver/memory pool races + mutable std::mutex textureUploadMutex; + + // Thread pool for background tasks (textures, etc.) + std::unique_ptr threadPool; + // Mutex to protect threadPool access during initialization/cleanup + mutable std::shared_mutex threadPoolMutex; + + // Texture loading progress (for UI) + std::atomic textureTasksScheduled{0}; + std::atomic textureTasksCompleted{0}; + std::atomic loadingFlag{false}; + + // Acceleration structure build UI progress (written on render thread). + // Kept as atomics because ImGui can query at any point during the frame. 
+ std::atomic asBuildUiActive{false}; + std::atomic asBuildUiProgress{0.0f}; + std::atomic asBuildUiDone{0}; + std::atomic asBuildUiTotal{0}; + std::atomic asBuildUiStage{"idle"}; + std::atomic asBuildUiStartNs{0}; + + // Default texture resources (used when no texture is provided) + TextureResources defaultTextureResources; + + // Performance clamps (to reduce per-frame cost) + static constexpr uint32_t MAX_ACTIVE_LIGHTS = 1024; // Limit the number of lights processed per frame + + // Static lights loaded during model initialization + std::vector staticLights; + + // Dynamic lighting system using storage buffers + struct LightStorageBuffer { + vk::raii::Buffer buffer = nullptr; + std::unique_ptr allocation = nullptr; + void* mapped = nullptr; + size_t capacity = 0; // Current capacity in number of lights + size_t size = 0; // Current number of lights + }; + std::vector lightStorageBuffers; // One per frame in flight + + // Entity resources (contains descriptor sets - must be declared before descriptor pool) + struct EntityResources { + std::vector uniformBuffers; + std::vector> uniformBufferAllocations; + std::vector uniformBuffersMapped; + std::vector basicDescriptorSets; // For basic pipeline + std::vector pbrDescriptorSets; // For PBR pipeline + + // Tracks last updated frame for descriptor sets to avoid redundant per-frame updates. + // size = MAX_FRAMES_IN_FLIGHT. Initialized to 0xFFFFFFFFFFFFFFFF. + std::vector lastUpdatedFrameBasic; + std::vector lastUpdatedFramePBR; + + // Instance buffer for instanced rendering + vk::raii::Buffer instanceBuffer = nullptr; + std::unique_ptr instanceBufferAllocation = nullptr; + void* instanceBufferMapped = nullptr; + + // Tracks whether binding 0 (UBO) has been written at least once for each frame + // for each pipeline type. 
Descriptor sets for non-current frames are allocated + // but not necessarily initialized immediately (to avoid update-after-bind hazards), + // so each frame needs a one-time initialization at its safe point. + std::vector pbrUboBindingWritten; // size = MAX_FRAMES_IN_FLIGHT + std::vector basicUboBindingWritten; // size = MAX_FRAMES_IN_FLIGHT + + // Tracks whether image bindings have been written at least once for each frame. + // If false for the current frame at the safe point, we cold-initialize the + // image bindings (PBR: b1..b5 [+b6 when applicable], Basic: b1) with either + // real textures or shared defaults to avoid per-frame "black" flashes. + std::vector pbrImagesWritten; // size = MAX_FRAMES_IN_FLIGHT + std::vector basicImagesWritten; // size = MAX_FRAMES_IN_FLIGHT + + // Tracks whether the remaining required bindings in the PBR set 0 layout have + // been written at least once for each frame. + // This includes bindings like Forward+ tile buffers (7/8), reflection sampler (10), + // and TLAS (11). These bindings are required by the pipeline layout and must be + // valid before any draw that uses the PBR/glass pipelines. + std::vector pbrFixedBindingsWritten; // size = MAX_FRAMES_IN_FLIGHT + + // Cached material lookup/classification for raster rendering. + // Avoids per-frame string parsing of entity names ("_Material_") and repeated + // ModelLoader material lookups across culling, sorting, and draw loops. 
+ bool materialCacheValid = false; + const Material* cachedMaterial = nullptr; + // Derived flags used by render queues and sorting heuristics + bool cachedIsBlended = false; + bool cachedIsGlass = false; + bool cachedIsLiquid = false; + // Material-derived push constants defaults (static per-entity unless material changes) + MaterialProperties cachedMaterialProps{}; + }; + + // Cached job for rendering a single entity in a frame + struct RenderJob + { + Entity *entity; + EntityResources *entityRes; + MeshResources *meshRes; + MeshComponent *meshComp; + TransformComponent *transformComp; + bool isAlphaMasked; + }; + std::unordered_map entityResources; + + // Descriptor pool (declared after entity resources to ensure proper destruction order) + vk::raii::DescriptorPool descriptorPool = nullptr; + + // Current frame index + uint32_t currentFrame = 0; + + // Queue family indices + QueueFamilyIndices queueFamilyIndices; + + // Validation layers + const std::vector validationLayers = { + "VK_LAYER_KHRONOS_validation" + }; + + // Required device extensions + const std::vector requiredDeviceExtensions = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME + }; + + // Optional device extensions + const std::vector optionalDeviceExtensions = { + VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME, + VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME, + VK_KHR_DEPTH_STENCIL_RESOLVE_EXTENSION_NAME, + VK_EXT_DESCRIPTOR_INDEXING_EXTENSION_NAME, + // Robustness and safety + VK_EXT_ROBUSTNESS_2_EXTENSION_NAME, + // Tile/local memory friendly dynamic rendering readback + VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME, + // Shader tile image for fast tile access + VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME, + // Ray query support for ray-traced rendering + VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME, + VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME, + VK_KHR_RAY_QUERY_EXTENSION_NAME, + VK_EXT_HOST_IMAGE_COPY_EXTENSION_NAME, + VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, + VK_KHR_MAINTENANCE_5_EXTENSION_NAME, + 
VK_KHR_MAINTENANCE_6_EXTENSION_NAME + }; + + // All device extensions (required + optional) + std::vector deviceExtensions; + + // Initialization flag + bool initialized = false; + // Whether VK_EXT_descriptor_indexing (update-after-bind) path is enabled + bool descriptorIndexingEnabled = false; + bool storageAfterBindEnabled = false; + // Feature toggles detected/enabled at device creation + bool robustness2Enabled = false; + bool dynamicRenderingLocalReadEnabled = false; + bool shaderTileImageEnabled = false; + bool rayQueryEnabled = false; + bool accelerationStructureEnabled = false; + + // When true and current render mode is RayQuery, the engine renders a static opaque scene: + // - Animation/physics updates are suppressed by the Engine (input/Update hook) + // - TLAS refit per-frame is skipped to avoid any animation-driven changes + // - The AS is built once after loading completes + // Default now OFF so animation is enabled again for AS (per user request) + bool rayQueryStaticOnly = false; + + // (No debug-only TLAS filtering in production.) + + // Framebuffer resized flag (atomic to handle platform callback vs. render thread) + std::atomic framebufferResized{false}; + // Guard to prevent descriptor updates while a command buffer is recording + std::atomic isRecordingCmd{false}; + // Descriptor sets may be temporarily invalid during swapchain recreation; suppress updates then. + std::atomic descriptorSetsValid{true}; + // Request flag for acceleration structure build (set by loading thread, cleared by render thread) + std::atomic asBuildRequested{false}; + // Timestamp of the most recent AS build request (steady_clock ns). Used to prevent infinite deferral. + std::atomic asBuildRequestStartNs{0}; + + // Track last successfully built AS sizes to avoid rebuilding with a smaller subset + // (e.g., during incremental streaming where not all meshes are ready yet). + // We only accept AS builds that are monotonically non-decreasing in counts. 
+ size_t lastASBuiltBLASCount = 0; + // NOTE: This is the number of renderable ENTITIES included in the AS build (not TLAS instances). + size_t lastASBuiltInstanceCount = 0; + size_t lastBuiltUniqueMeshCount = 0; + std::chrono::steady_clock::time_point lastASBuildTime = std::chrono::steady_clock::now() - std::chrono::hours(1); + // TLAS instance count (includes per-mesh instancing). Used for logging and shader bounds. + size_t lastASBuiltTlasInstanceCount = 0; + + // Freeze TLAS rebuilds after a full build to prevent regressions (e.g., animation-only TLAS) + bool asFreezeAfterFullBuild = true; // enable freezing behavior + bool asFrozen = false; // once frozen, ignore rebuilds unless explicitly overridden + // Optional developer override to allow rebuild while frozen + bool asDevOverrideAllowRebuild = false; + // Reason string for the last time a build was requested (for logging) + std::string lastASBuildRequestReason; + + // Opportunistic rebuilds (when counts increase) can cause unintended TLAS churn during animation. + // Leave this disabled by default; TLAS builds should be explicit (on mode switch / scene ready). + bool asOpportunisticRebuildEnabled = false; + + // --- AS UPDATE/Refit state --- + // Persistent TLAS instances buffer & order for UPDATE (refit) + struct TlasInstanceRef { + class Entity* entity{nullptr}; + uint32_t instanceIndex{0}; // valid only when instanced==true + bool instanced{false}; // true when this TLAS entry comes from MeshComponent instancing + }; + vk::raii::Buffer tlasInstancesBuffer{nullptr}; + std::unique_ptr tlasInstancesAllocation; + uint32_t tlasInstanceCount = 0; + std::vector tlasInstanceOrder; // order must match buffer instances + + // Scratch buffer for TLAS UPDATE operations + vk::raii::Buffer tlasUpdateScratchBuffer{nullptr}; + std::unique_ptr tlasUpdateScratchAllocation; + + // Maximum number of frames in flight + // More than 1 allows CPU/GPU overlap and reduces per-frame stalls. 
+ // All per-frame resources (UBOs, descriptor sets, reflection RTs, etc.) + // are sized dynamically based on this value. + static constexpr uint32_t MAX_FRAMES_IN_FLIGHT = 3u; + + // --- Performance & diagnostics --- + UniformBufferObject frameUboTemplate{}; + bool enableFrustumCulling = true; + uint32_t lastCullingVisibleCount = 0; + uint32_t lastCullingCulledCount = 0; + // Distance-based LOD (projected-size skip in pixels) + bool enableDistanceLOD = true; + float lodPixelThresholdOpaque = 1.5f; + float lodPixelThresholdTransparent = 2.5f; + // Sampler anisotropy preference (clamped to device limits) + float samplerMaxAnisotropy = 8.0f; + // Upper bound on auto-generated mip levels (to avoid excessive VRAM use on huge textures) + uint32_t maxAutoGeneratedMipLevels = 4; + + // --- Planar reflections (scaffolding) --- + bool enablePlanarReflections = false; // UI toggle to enable/disable planar reflections + float reflectionResolutionScale = 0.5f; // Scale relative to swapchain size + // Cached per-frame reflection data used by UBO population + // Current frame's reflection VP (for rendering the reflection pass) + glm::mat4 currentReflectionVP{1.0f}; + glm::vec4 currentReflectionPlane{0.0f, 1.0f, 0.0f, 0.0f}; + // Per-frame stored reflection VP (written during reflection pass) + std::vector reflectionVPs; // size MAX_FRAMES_IN_FLIGHT + // The VP to sample in the main pass (prev-frame VP to match prev-frame texture) + glm::mat4 sampleReflectionVP{1.0f}; + bool reflectionResourcesDirty = false; // recreate reflection RTs at safe point + + // --- Ray query rendering options --- + bool enableRayQueryReflections = true; // UI toggle to enable reflections in ray query mode + bool enableRayQueryTransparency = true; // UI toggle to enable transparency/refraction in ray query mode + + // === Watchdog system to detect application hangs === + // Atomic timestamp updated every frame - watchdog thread checks if stale + std::atomic lastFrameUpdateTime; + // Low-noise progress 
marker to pinpoint where the render thread stalled when the watchdog fires + std::atomic watchdogProgressLabel{"init"}; + // Optional numeric marker to help pinpoint stalls inside large loops + std::atomic watchdogProgressIndex{0}; + std::thread watchdogThread; + std::atomic watchdogRunning{false}; + // Some operations (notably BLAS/TLAS builds in Debug on large scenes) can legitimately take + // longer than the watchdog threshold. When set, the watchdog will not abort. + std::atomic watchdogSuppressed{false}; + + // === Descriptor update deferral while recording === + struct PendingDescOp { + Entity* entity; + std::string texPath; + bool usePBR; + uint32_t frameIndex; + bool imagesOnly; + }; + std::mutex pendingDescMutex; + std::vector pendingDescOps; // flushed at frame safe point + std::atomic descriptorRefreshPending{false}; + + struct ReflectionRT { + vk::raii::Image color{nullptr}; + std::unique_ptr colorAlloc{nullptr}; + vk::raii::ImageView colorView{nullptr}; + vk::raii::Sampler colorSampler{nullptr}; + + vk::raii::Image depth{nullptr}; + std::unique_ptr depthAlloc{nullptr}; + vk::raii::ImageView depthView{nullptr}; + + uint32_t width{0}; + uint32_t height{0}; + }; + std::vector reflections; // one per frame-in-flight + + // Private methods + bool createInstance(const std::string& appName, bool enableValidationLayers, bool debugSync); + bool setupDebugMessenger(bool enableValidationLayers); + bool createSurface(); + bool checkValidationLayerSupport() const; + bool pickPhysicalDevice(); + void addSupportedOptionalExtensions(); + bool createLogicalDevice(bool enableValidationLayers); + bool createSwapChain(); + bool createImageViews(); + bool setupDynamicRendering(); + bool createDescriptorSetLayout(); + bool createPBRDescriptorSetLayout(); + bool createGraphicsPipeline(); + + bool createPBRPipeline(); + bool createLightingPipeline(); + bool createDepthPrepassPipeline(); + bool createForwardPlusPipelinesAndResources(); + + // Ray query pipeline creation + 
bool createRayQueryDescriptorSetLayout(); + bool createRayQueryPipeline(); + bool createRayQueryResources(); + // If updateOnlyCurrentFrame is true, only descriptor sets for currentFrame will be updated. + // Use updateOnlyCurrentFrame=false during initialization/swapchain recreation when the device is idle. + bool createOrResizeForwardPlusBuffers(uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, bool updateOnlyCurrentFrame = false); + void updateForwardPlusParams(uint32_t frameIndex, const glm::mat4& view, const glm::mat4& proj, uint32_t lightCount, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, float nearZ, float farZ); + void dispatchForwardPlus(vk::raii::CommandBuffer& cmd, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ); + // Ensure Forward+ compute descriptor set binding 0 (lights SSBO) is bound for a frame + void refreshForwardPlusComputeLightsBindingForFrame(uint32_t frameIndex); + bool createComputePipeline(); + void pushMaterialProperties(vk::CommandBuffer commandBuffer, const MaterialProperties& material) const; + bool createCommandPool(); + + // Shadow mapping methods + bool createComputeCommandPool(); + bool createDepthResources(); + bool createTextureImage(const std::string& texturePath, TextureResources& resources); + bool createTextureImageView(TextureResources& resources); + bool createTextureSampler(TextureResources& resources); + bool createDefaultTextureResources(); + bool createSharedDefaultPBRTextures(); + bool createMeshResources(MeshComponent* meshComponent, bool deferUpload = false); + bool createUniformBuffers(Entity* entity); + bool createDescriptorPool(); + bool createDescriptorSets(Entity* entity, const std::string& texturePath, bool usePBR = false); + bool createDescriptorSets(Entity *entity, EntityResources &res, const std::string &texturePath, bool usePBR = false); + bool updateDescriptorSetsForFrame(Entity *entity, + const std::string &texturePath, + bool usePBR, + uint32_t frameIndex, + bool imagesOnly = false, + bool 
uboOnly = false); + bool updateDescriptorSetsForFrame(Entity *entity, + EntityResources &res, + const std::string &texturePath, + bool usePBR, + uint32_t frameIndex, + bool imagesOnly = false, + bool uboOnly = false); + // Refresh only the currentFrame PBR descriptor set bindings that Forward+ relies on + // (b6 = lights SSBO, b7 = tile headers, b8 = tile indices). Safe to call after + // we've waited on the frame fence at the start of Render(). + void refreshPBRForwardPlusBindingsForFrame(uint32_t frameIndex); + bool createCommandBuffers(); + bool createSyncObjects(); + + void cleanupSwapChain(); + + // Planar reflection helpers (initial scaffolding) + bool createReflectionResources(uint32_t width, uint32_t height); + void destroyReflectionResources(); + // Render the scene into the reflection RT (mirrored about a plane) — to be fleshed out next step + void renderReflectionPass(vk::raii::CommandBuffer& cmd, + const glm::vec4& planeWS, + CameraComponent* camera, + const std::vector &jobs); + + // Ensure Vulkan-Hpp dispatcher is initialized for the current thread when using RAII objects on worker threads + void ensureThreadLocalVulkanInit() const; + + // Cache and classify an entity's material for raster rendering (opaque vs blended, glass/liquid flags, + // and push-constant defaults). This avoids repeated per-frame string parsing and material lookups. 
+ void ensureEntityMaterialCache(Entity* entity, EntityResources &res); + + // ===================== Culling helpers ===================== + struct FrustumPlanes { + // Plane equation ax + by + cz + d >= 0 considered inside + glm::vec4 planes[6]{}; // 0=L,1=R,2=B,3=T,4=N,5=F + }; + + static FrustumPlanes extractFrustumPlanes(const glm::mat4& vp); + + static void transformAABB(const glm::mat4& M, + const glm::vec3& localMin, + const glm::vec3& localMax, + glm::vec3& outMin, + glm::vec3& outMax); + + static bool aabbIntersectsFrustum(const glm::vec3& worldMin, + const glm::vec3& worldMax, + const FrustumPlanes& frustum); + void recreateSwapChain(); + + void updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, TransformComponent *tc = nullptr); + void updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, const glm::mat4& customTransform); + void updateUniformBufferInternal(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, UniformBufferObject& ubo); + void prepareFrameUboTemplate(CameraComponent *camera); + + vk::raii::ShaderModule createShaderModule(const std::vector& code); + + QueueFamilyIndices findQueueFamilies(const vk::raii::PhysicalDevice& device); + SwapChainSupportDetails querySwapChainSupport(const vk::raii::PhysicalDevice& device); + bool isDeviceSuitable(vk::raii::PhysicalDevice& device); + bool checkDeviceExtensionSupport(vk::raii::PhysicalDevice& device); + + vk::SurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector& availableFormats); + vk::PresentModeKHR chooseSwapPresentMode(const std::vector& availablePresentModes); + vk::Extent2D chooseSwapExtent(const vk::SurfaceCapabilitiesKHR& capabilities); + + uint32_t findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const; + + std::pair createBuffer(vk::DeviceSize size, vk::BufferUsageFlags usage, vk::MemoryPropertyFlags 
properties); + bool createOpaqueSceneColorResources(); + void createTransparentDescriptorSets(); + void createTransparentFallbackDescriptorSets(); + std::pair> createBufferPooled(vk::DeviceSize size, vk::BufferUsageFlags usage, vk::MemoryPropertyFlags properties); + void copyBuffer(vk::raii::Buffer& srcBuffer, vk::raii::Buffer& dstBuffer, vk::DeviceSize size); + + std::pair createImage(uint32_t width, uint32_t height, vk::Format format, vk::ImageTiling tiling, vk::ImageUsageFlags usage, vk::MemoryPropertyFlags properties); + std::pair> createImagePooled(uint32_t width, uint32_t height, vk::Format format, vk::ImageTiling tiling, vk::ImageUsageFlags usage, vk::MemoryPropertyFlags properties, uint32_t mipLevels = 1, vk::SharingMode sharingMode = vk::SharingMode::eExclusive, const std::vector& queueFamilies = {}); + void transitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels = 1); + void copyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height, vk::ArrayProxy regions); + // Extended: track stagedBytes for perf stats + void uploadImageFromStaging(vk::Buffer staging, + vk::Image image, + vk::Format format, + vk::ArrayProxy regions, + uint32_t mipLevels, + vk::DeviceSize stagedBytes); + + vk::raii::ImageView createImageView(vk::raii::Image& image, vk::Format format, vk::ImageAspectFlags aspectFlags, uint32_t mipLevels = 1); + vk::Format findSupportedFormat(const std::vector& candidates, vk::ImageTiling tiling, vk::FormatFeatureFlags features); + bool hasStencilComponent(vk::Format format); + + // Background uploader helpers + void StartUploadsWorker(size_t workerCount = 0); + void StopUploadsWorker(); + + // Serialize descriptor writes vs command buffer recording to avoid mid-record updates during recording + std::mutex renderRecordMutex; + + // (Descriptor API wrappers were considered but avoided here to keep RAII types intact.) 
+ + // Upload perf getters + public: + std::string ResolvePath(const std::string& filename) const; + std::vector readFile(const std::string& filename); + + uint64_t GetBytesUploadedTotal() const { + return bytesUploadedTotal.load(std::memory_order_relaxed); + } + double GetAverageUploadMs() const { + uint64_t ns = totalUploadNs.load(std::memory_order_relaxed); + uint32_t cnt = uploadCount.load(std::memory_order_relaxed); + if (cnt == 0) + return 0.0; + return static_cast(ns) / 1e6 / static_cast(cnt); + } + double GetUploadThroughputMBps() const { + uint64_t startNs = uploadWindowStartNs.load(std::memory_order_relaxed); + if (startNs == 0) + return 0.0; + auto now = std::chrono::steady_clock::now().time_since_epoch(); + uint64_t nowNs = static_cast(std::chrono::duration_cast(now).count()); + if (nowNs <= startNs) + return 0.0; + double seconds = static_cast(nowNs - startNs) / 1e9; + double mb = static_cast(bytesUploadedTotal.load(std::memory_order_relaxed)) / (1024.0 * 1024.0); + return seconds > 0.0 ? (mb / seconds) : 0.0; + } +}; diff --git a/attachments/sync2_engine/renderer_compute.cpp b/attachments/sync2_engine/renderer_compute.cpp new file mode 100644 index 00000000..ff4c1563 --- /dev/null +++ b/attachments/sync2_engine/renderer_compute.cpp @@ -0,0 +1,556 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "renderer.h" +#include +#include +#include + +// This file contains compute-related methods from the Renderer class + +// Create compute pipeline +bool Renderer::createComputePipeline() { + try { + // Read compute shader code + auto computeShaderCode = readFile("shaders/hrtf.spv"); + + // Create shader module + vk::raii::ShaderModule computeShaderModule = createShaderModule(computeShaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo computeShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eCompute, + .module = *computeShaderModule, + .pName = "main" + }; + + // Create compute descriptor set layout + std::array computeBindings = { + vk::DescriptorSetLayoutBinding{ + .binding = 0, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + .pImmutableSamplers = nullptr + }, + vk::DescriptorSetLayoutBinding{ + .binding = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + .pImmutableSamplers = nullptr + }, + vk::DescriptorSetLayoutBinding{ + .binding = 2, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + .pImmutableSamplers = nullptr + }, + vk::DescriptorSetLayoutBinding{ + .binding = 3, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + .pImmutableSamplers = nullptr + } + }; + + vk::DescriptorSetLayoutCreateInfo computeLayoutInfo{ + .bindingCount = static_cast(computeBindings.size()), + .pBindings = computeBindings.data() + }; + + computeDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, computeLayoutInfo); + + // Create compute pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = 1, + .pSetLayouts = &*computeDescriptorSetLayout, + .pushConstantRangeCount = 0, + 
.pPushConstantRanges = nullptr + }; + + computePipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Create compute pipeline + vk::ComputePipelineCreateInfo pipelineInfo{ + .stage = computeShaderStageInfo, + .layout = *computePipelineLayout + }; + + computePipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + + // Create compute descriptor pool + std::array poolSizes = { + vk::DescriptorPoolSize{ + .type = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 6u * MAX_FRAMES_IN_FLIGHT // room for multiple compute pipelines + }, + vk::DescriptorPoolSize{ + .type = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 2u * MAX_FRAMES_IN_FLIGHT + } + }; + + vk::DescriptorPoolCreateInfo poolInfo{ + .flags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet, + .maxSets = 2u * MAX_FRAMES_IN_FLIGHT, + .poolSizeCount = static_cast(poolSizes.size()), + .pPoolSizes = poolSizes.data() + }; + + computeDescriptorPool = vk::raii::DescriptorPool(device, poolInfo); + + return createComputeCommandPool(); + } catch (const std::exception& e) { + std::cerr << "Failed to create compute pipeline: " << e.what() << std::endl; + return false; + } +} + +// Forward+ compute (tiled light culling) +bool Renderer::createForwardPlusPipelinesAndResources() { + try { + // Load compute shader + auto cullSpv = readFile("shaders/forward_plus_cull.spv"); + vk::raii::ShaderModule cullModule = createShaderModule(cullSpv); + + // Descriptor set layout: 0=lights SSBO (RO), 1=tile headers SSBO (RW), 2=tile indices SSBO (RW), 3=params UBO (RO) + std::array bindings = { + vk::DescriptorSetLayoutBinding{.binding = 0, .descriptorType = vk::DescriptorType::eStorageBuffer, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute}, + vk::DescriptorSetLayoutBinding{.binding = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute}, + vk::DescriptorSetLayoutBinding{.binding = 2, 
.descriptorType = vk::DescriptorType::eStorageBuffer, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute}, + vk::DescriptorSetLayoutBinding{.binding = 3, .descriptorType = vk::DescriptorType::eUniformBuffer, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute} + }; + + vk::DescriptorSetLayoutCreateInfo layoutInfo{.bindingCount = static_cast(bindings.size()), .pBindings = bindings.data()}; + forwardPlusDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo); + + // Pipeline layout + vk::PipelineLayoutCreateInfo plInfo{.setLayoutCount = 1, .pSetLayouts = &*forwardPlusDescriptorSetLayout}; + forwardPlusPipelineLayout = vk::raii::PipelineLayout(device, plInfo); + + // Pipeline + vk::PipelineShaderStageCreateInfo stage{.stage = vk::ShaderStageFlagBits::eCompute, .module = *cullModule, .pName = "main"}; + vk::ComputePipelineCreateInfo cpInfo{.stage = stage, .layout = *forwardPlusPipelineLayout}; + forwardPlusPipeline = vk::raii::Pipeline(device, nullptr, cpInfo); + + // Allocate per-frame structs + forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT); + + // Allocate compute descriptor sets (reuse computeDescriptorPool) + std::vector layouts(MAX_FRAMES_IN_FLIGHT, *forwardPlusDescriptorSetLayout); + vk::DescriptorSetAllocateInfo allocInfo{.descriptorPool = *computeDescriptorPool, .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, .pSetLayouts = layouts.data()}; + auto sets = vk::raii::DescriptorSets(device, allocInfo); + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) { + forwardPlusPerFrame[i].computeSet = std::move(sets[i]); + } + + // Initial buffer allocation based on current swapchain extent (also updates descriptors) + uint32_t tilesX = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX; + uint32_t tilesY = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY; + if (!createOrResizeForwardPlusBuffers(tilesX, tilesY, forwardPlusSlicesZ)) { + return false; + } + + return true; + } 
catch (const std::exception& e) { + std::cerr << "Failed to create Forward+ compute resources: " << e.what() << std::endl; + return false; + } +} + +bool Renderer::createOrResizeForwardPlusBuffers(uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, bool updateOnlyCurrentFrame) { + try { + size_t clusters = static_cast(tilesX) * static_cast(tilesY) * static_cast(slicesZ); + size_t indices = clusters * static_cast(MAX_LIGHTS_PER_TILE); + + // Range of frames to touch this call + size_t beginFrame = 0; + size_t endFrame = MAX_FRAMES_IN_FLIGHT; + if (updateOnlyCurrentFrame) { + beginFrame = static_cast(currentFrame); + endFrame = beginFrame + 1; + } + + for (size_t i = beginFrame; i < endFrame; ++i) { + auto& f = forwardPlusPerFrame[i]; + bool needTiles = (f.tilesCapacity < clusters) || (!*f.tileHeaders); + bool needIdx = (f.indicesCapacity < indices) || (!*f.tileLightIndices); + + if (needTiles) { + if (!!*f.tileHeaders) { + f.tileHeaders = vk::raii::Buffer(nullptr); + f.tileHeadersAlloc.reset(); + } + auto [buf, alloc] = createBufferPooled(clusters * sizeof(TileHeader), vk::BufferUsageFlagBits::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileHeaders = std::move(buf); + f.tileHeadersAlloc = std::move(alloc); + f.tilesCapacity = clusters; + // Initialize headers to zero so that count==0 when Forward+ is disabled or before first dispatch + if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) { + std::memset(f.tileHeadersAlloc->mappedPtr, 0, clusters * sizeof(TileHeader)); + } + } + if (needIdx) { + if (!!*f.tileLightIndices) { + f.tileLightIndices = vk::raii::Buffer(nullptr); + f.tileLightIndicesAlloc.reset(); + } + auto [buf, alloc] = createBufferPooled(indices * sizeof(uint32_t), vk::BufferUsageFlagBits::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + 
f.tileLightIndices = std::move(buf); + f.tileLightIndicesAlloc = std::move(alloc); + f.indicesCapacity = indices; + // Initialize indices to zero to avoid stray reads + if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) { + std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, indices * sizeof(uint32_t)); + } + } + if (!*f.params) { + auto [pbuf, palloc] = createBufferPooled(sizeof(glm::mat4) * 2 + sizeof(glm::vec4) * 3, vk::BufferUsageFlagBits::eUniformBuffer, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.params = std::move(pbuf); + f.paramsAlloc = std::move(palloc); + f.paramsMapped = f.paramsAlloc->mappedPtr; + } + + // Update compute descriptor set writes for this frame (only if buffers changed or first time) + if (!!*forwardPlusPerFrame[i].computeSet) { + if (!descriptorSetsValid.load(std::memory_order_relaxed)) { + // Descriptor sets are being recreated; skip writes this iteration + continue; + } + if (isRecordingCmd.load(std::memory_order_relaxed)) { + // Avoid update-after-bind while a command buffer is recording + continue; + } + // Only update descriptors if we resized or created any buffer this iteration + if (needTiles || needIdx || !!*f.params) { + // Build writes conditionally to avoid dereferencing uninitialized light buffers + std::vector writes; + + // Binding 0: lights SSBO (only if available) + bool haveLightBuffer = (i < lightStorageBuffers.size()) && !!*lightStorageBuffers[i].buffer; + vk::DescriptorBufferInfo lightsInfo{}; + if (haveLightBuffer) { + lightsInfo = vk::DescriptorBufferInfo{.buffer = *lightStorageBuffers[i].buffer, .offset = 0, .range = VK_WHOLE_SIZE}; + writes.push_back(vk::WriteDescriptorSet{.dstSet = *forwardPlusPerFrame[i].computeSet, .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightsInfo}); + } + + // Binding 1: tile headers + vk::DescriptorBufferInfo headersInfo{.buffer = *f.tileHeaders, .offset = 0, 
.range = VK_WHOLE_SIZE}; + writes.push_back(vk::WriteDescriptorSet{.dstSet = *forwardPlusPerFrame[i].computeSet, .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo}); + + // Binding 2: tile indices + vk::DescriptorBufferInfo indicesInfo{.buffer = *f.tileLightIndices, .offset = 0, .range = VK_WHOLE_SIZE}; + writes.push_back(vk::WriteDescriptorSet{.dstSet = *forwardPlusPerFrame[i].computeSet, .dstBinding = 2, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo}); + + // Binding 3: params UBO + vk::DescriptorBufferInfo paramsInfo{.buffer = *f.params, .offset = 0, .range = VK_WHOLE_SIZE}; + writes.push_back(vk::WriteDescriptorSet{.dstSet = *forwardPlusPerFrame[i].computeSet, .dstBinding = 3, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &paramsInfo}); + + if (!writes.empty()) { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(writes, {}); + } + } + } + } + + // Update PBR descriptor sets to bind new tile buffers for forward shading. + // Avoid updating sets that may be in use by in-flight command buffers. + // If updateOnlyCurrentFrame=true, only update the current frame's sets (safe point after fence wait).
+ try { + // Only update PBR descriptor sets for bindings 7/8 in two situations: + // - When called in initialization/device-idle paths (updateOnlyCurrentFrame=false), or + // - When this call resulted in (re)creating the buffers for the current frame + size_t beginFrameSets = 0; + size_t endFrameSets = forwardPlusPerFrame.size(); + if (updateOnlyCurrentFrame) { + beginFrameSets = static_cast(currentFrame); + endFrameSets = beginFrameSets + 1; + } + + for (auto& kv : entityResources) { + auto& resources = kv.second; + if (resources.pbrDescriptorSets.empty()) + continue; + for (size_t i = beginFrameSets; i < endFrameSets && i < resources.pbrDescriptorSets.size() && i < forwardPlusPerFrame.size(); ++i) { + if (!descriptorSetsValid.load(std::memory_order_relaxed)) + continue; + if (isRecordingCmd.load(std::memory_order_relaxed)) + continue; + if (!(*resources.pbrDescriptorSets[i])) + continue; + auto& f = forwardPlusPerFrame[i]; + if (!*f.tileHeaders || !*f.tileLightIndices) + continue; + vk::DescriptorBufferInfo headersInfo{.buffer = *f.tileHeaders, .offset = 0, .range = VK_WHOLE_SIZE}; + vk::DescriptorBufferInfo indicesInfo{.buffer = *f.tileLightIndices, .offset = 0, .range = VK_WHOLE_SIZE}; + std::array writes = { + vk::WriteDescriptorSet{.dstSet = *resources.pbrDescriptorSets[i], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo}, + vk::WriteDescriptorSet{.dstSet = *resources.pbrDescriptorSets[i], .dstBinding = 8, .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo} + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(writes, {}); + } + } + } + } catch (...) 
{ + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create/resize Forward+ buffers: " << e.what() << std::endl; + return false; + } +} + +void Renderer::updateForwardPlusParams(uint32_t frameIndex, const glm::mat4& view, const glm::mat4& proj, uint32_t lightCount, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, float nearZ, float farZ) { + if (frameIndex >= forwardPlusPerFrame.size()) + return; + auto& f = forwardPlusPerFrame[frameIndex]; + if (!f.paramsMapped) + return; + + // Pack: [view][proj][screen xy, tile xy][lightCount, maxPerTile, tilesX, tilesY][near, far, slicesZ, 0] + struct ParamsCPU { + glm::mat4 view; + glm::mat4 proj; + glm::vec4 screenTile; // x=width,y=height,z=tileX,w=tileY + glm::uvec4 counts; // x=lightCount,y=maxPerTile,z=tilesX,w=tilesY + glm::vec4 zParams; // x=nearZ,y=farZ,z=slicesZ,w=0 + }; + + ParamsCPU p{}; + p.view = view; + p.proj = proj; + p.screenTile = glm::vec4(static_cast<float>(swapChainExtent.width), static_cast<float>(swapChainExtent.height), static_cast<float>(forwardPlusTileSizeX), static_cast<float>(forwardPlusTileSizeY)); + p.counts = glm::uvec4(lightCount, MAX_LIGHTS_PER_TILE, tilesX, tilesY); + p.zParams = glm::vec4(nearZ, farZ, static_cast<float>(slicesZ), 0.0f); + + std::memcpy(f.paramsMapped, &p, sizeof(ParamsCPU)); // write through the pointer validated by the guard above, not via the unchecked paramsAlloc +} + +void Renderer::dispatchForwardPlus(vk::raii::CommandBuffer& cmd, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ) { + if (!*forwardPlusPipeline) + return; + if (currentFrame >= forwardPlusPerFrame.size()) + return; + auto& f = forwardPlusPerFrame[currentFrame]; + if (!*f.computeSet) + return; + + // Ensure a valid lights buffer is bound; otherwise skip compute this frame + bool haveLightBuffer = (currentFrame < lightStorageBuffers.size()) && !!*lightStorageBuffers[currentFrame].buffer; + if (!haveLightBuffer) + return; + + cmd.bindPipeline(vk::PipelineBindPoint::eCompute, *forwardPlusPipeline); + vk::DescriptorSet set = *f.computeSet; +
cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *forwardPlusPipelineLayout, 0, set, {}); + // One invocation per cluster (X,Y by workgroup grid, Z as third dimension) + cmd.dispatch(tilesX, tilesY, slicesZ); + // Make tilelist writes visible to fragment shader (Sync2) + vk::MemoryBarrier2 memBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader, + .srcAccessMask = vk::AccessFlagBits2::eShaderWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead + }; + vk::DependencyInfo depInfoComputeToFrag{.memoryBarrierCount = 1, .pMemoryBarriers = &memBarrier2}; + cmd.pipelineBarrier2(depInfoComputeToFrag); +} + +// Ensure compute descriptor binding 0 (lights SSBO) is bound for the given frame. +void Renderer::refreshForwardPlusComputeLightsBindingForFrame(uint32_t frameIndex) { + try { + if (frameIndex >= forwardPlusPerFrame.size()) + return; + if (!*forwardPlusPerFrame[frameIndex].computeSet) + return; + if (frameIndex >= lightStorageBuffers.size()) + return; + if (!*lightStorageBuffers[frameIndex].buffer) + return; + + // Updating descriptor sets during recording causes validation errors: + // "commandBuffer must be in the recording state" and invalidates the command buffer. + // These descriptor sets are already initialized earlier at the safe point (line 1059), + // so this redundant update during recording is unnecessary and harmful. 
+ if (isRecordingCmd.load(std::memory_order_relaxed)) { + return; // Skip update, descriptor is already valid from earlier initialization + } + + vk::DescriptorBufferInfo lightsInfo{.buffer = *lightStorageBuffers[frameIndex].buffer, .offset = 0, .range = VK_WHOLE_SIZE}; + vk::WriteDescriptorSet write{.dstSet = *forwardPlusPerFrame[frameIndex].computeSet, .dstBinding = 0, .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightsInfo}; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(write, {}); + } + } catch (const std::exception& e) { + std::cerr << "Failed to refresh Forward+ compute lights binding for frame " << frameIndex << ": " << e.what() << std::endl; + } +} + +// Create compute command pool +bool Renderer::createComputeCommandPool() { + try { + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.computeFamily.value() + }; + + computeCommandPool = vk::raii::CommandPool(device, poolInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create compute command pool: " << e.what() << std::endl; + return false; + } +} + +// Dispatch compute shader +vk::raii::Fence Renderer::DispatchCompute(uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ, + vk::Buffer inputBuffer, + vk::Buffer outputBuffer, + vk::Buffer hrtfBuffer, + vk::Buffer paramsBuffer) { + try { + // Create fence for synchronization + vk::FenceCreateInfo fenceInfo{}; + vk::raii::Fence computeFence(device, fenceInfo); + + // Create descriptor sets + vk::DescriptorSetAllocateInfo allocInfo{ + .descriptorPool = *computeDescriptorPool, + .descriptorSetCount = 1, + .pSetLayouts = &*computeDescriptorSetLayout + }; { + std::lock_guard lk(descriptorMutex); + computeDescriptorSets = device.allocateDescriptorSets(allocInfo); + } + + // Update descriptor sets + vk::DescriptorBufferInfo 
inputBufferInfo{ + .buffer = inputBuffer, + .offset = 0, + .range = VK_WHOLE_SIZE + }; + + vk::DescriptorBufferInfo outputBufferInfo{ + .buffer = outputBuffer, + .offset = 0, + .range = VK_WHOLE_SIZE + }; + + vk::DescriptorBufferInfo hrtfBufferInfo{ + .buffer = hrtfBuffer, + .offset = 0, + .range = VK_WHOLE_SIZE + }; + + vk::DescriptorBufferInfo paramsBufferInfo{ + .buffer = paramsBuffer, + .offset = 0, + .range = VK_WHOLE_SIZE + }; + + std::array descriptorWrites = { + vk::WriteDescriptorSet{ + .dstSet = *computeDescriptorSets[0], + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &inputBufferInfo + }, + vk::WriteDescriptorSet{ + .dstSet = *computeDescriptorSets[0], + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &outputBufferInfo + }, + vk::WriteDescriptorSet{ + .dstSet = *computeDescriptorSets[0], + .dstBinding = 2, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &hrtfBufferInfo + }, + vk::WriteDescriptorSet{ + .dstSet = *computeDescriptorSets[0], + .dstBinding = 3, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &paramsBufferInfo + } + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + + // Create command buffer using dedicated compute command pool + vk::CommandBufferAllocateInfo cmdAllocInfo{ + .commandPool = *computeCommandPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + + auto commandBuffers = device.allocateCommandBuffers(cmdAllocInfo); + // Use RAII wrapper temporarily for recording to preserve dispatch loader + vk::raii::CommandBuffer commandBufferRaii = std::move(commandBuffers[0]); + + // Begin command buffer + vk::CommandBufferBeginInfo beginInfo{ + 
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit + }; + + commandBufferRaii.begin(beginInfo); + + // Bind compute pipeline + commandBufferRaii.bindPipeline(vk::PipelineBindPoint::eCompute, *computePipeline); + + // Bind descriptor set + commandBufferRaii.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *computePipelineLayout, 0, *computeDescriptorSets[0], {}); + + // Dispatch compute shader + commandBufferRaii.dispatch(groupCountX, groupCountY, groupCountZ); + + // End command buffer + commandBufferRaii.end(); + + // Extract raw command buffer for submission and release RAII ownership + // This prevents premature destruction while preserving the recorded commands + vk::CommandBuffer rawCommandBuffer = *commandBufferRaii; + commandBufferRaii.release(); // Release RAII ownership to prevent destruction. NOTE(review): nothing visible here frees this command buffer afterwards - confirm it is reclaimed elsewhere (e.g. pool reset) + + // Submit command buffer with fence for synchronization + SubmitToQueue2(*computeQueue, rawCommandBuffer, false, nullptr, *computeFence); + + // Return fence for non-blocking synchronization + return computeFence; + } catch (const std::exception& e) { + std::cerr << "Failed to dispatch compute shader: " << e.what() << std::endl; + // Nothing was submitted: return an already-signaled fence so a caller waiting on it returns immediately instead of blocking forever + vk::FenceCreateInfo fenceInfo{.flags = vk::FenceCreateFlagBits::eSignaled}; + return {device, fenceInfo}; + } +} diff --git a/attachments/sync2_engine/renderer_core.cpp b/attachments/sync2_engine/renderer_core.cpp new file mode 100644 index 00000000..8d8e99ab --- /dev/null +++ b/attachments/sync2_engine/renderer_core.cpp @@ -0,0 +1,1021 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "renderer.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE; // In a .cpp file + +#include +#include // For PFN_vkGetInstanceProcAddr and C types +#include + +// Debug callback for vk::raii - uses raw Vulkan C types for cross-platform compatibility +static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallbackVkRaii( + VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, + [[maybe_unused]] VkDebugUtilsMessageTypeFlagsEXT messageType, + const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, + [[maybe_unused]] void* pUserData) { + if (messageSeverity >= VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { + // Print a message to the console + std::cerr << "Validation layer: " << pCallbackData->pMessage << std::endl; + } else { + // Print a message to the console + std::cout << "Validation layer: " << pCallbackData->pMessage << std::endl; + } + + return VK_FALSE; +} + +// Vulkan-Hpp style callback signature for newer headers expecting vk:: types +static VKAPI_ATTR vk::Bool32 VKAPI_CALL debugCallbackVkHpp( + vk::DebugUtilsMessageSeverityFlagBitsEXT messageSeverity, + [[maybe_unused]] vk::DebugUtilsMessageTypeFlagsEXT messageType, + const vk::DebugUtilsMessengerCallbackDataEXT* pCallbackData, + [[maybe_unused]] void* pUserData) { + if (messageSeverity >= vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning) { + std::cerr << "Validation layer: " << pCallbackData->pMessage << std::endl; + } else { + std::cout << "Validation layer: " << 
pCallbackData->pMessage << std::endl; + } + return vk::False; +} + +// Watchdog thread function - monitors frame updates and aborts if application hangs +static void WatchdogThreadFunc(std::atomic* lastFrameTime, + std::atomic* running, + std::atomic* suppressed, + std::atomic* progressLabel, + std::atomic* progressIndex) { + while (running->load(std::memory_order_relaxed)) { + std::this_thread::sleep_for(std::chrono::seconds(5)); + + if (!running->load(std::memory_order_relaxed)) { + break; // Shutdown requested + } + + // Check if frame timestamp was updated recently. + // Some operations (e.g., BLAS/TLAS builds in Debug on large scenes) can legitimately take + // much longer than 5 or 10 seconds. When suppressed, allow a longer grace period. + auto now = std::chrono::steady_clock::now(); + auto lastUpdate = lastFrameTime->load(std::memory_order_relaxed); + auto elapsed = std::chrono::duration_cast(now - lastUpdate).count(); + const int64_t allowedSeconds = (suppressed && suppressed->load(std::memory_order_relaxed)) ? 
300 : 60; + + if (elapsed >= allowedSeconds) { + // APPLICATION HAS HUNG - no frame updates for allowedSeconds (60s normally, 300s while the watchdog is suppressed) + const char* label = nullptr; + if (progressLabel) { + label = progressLabel->load(std::memory_order_relaxed); + } + uint32_t idx = 0; + if (progressIndex) { + idx = progressIndex->load(std::memory_order_relaxed); + } + + std::cerr << "\n\n"; + std::cerr << "========================================\n"; + std::cerr << "WATCHDOG: APPLICATION HAS HUNG!\n"; + std::cerr << "========================================\n"; + std::cerr << "Last frame update was " << elapsed << " seconds ago.\n"; + if (suppressed && suppressed->load(std::memory_order_relaxed)) { + std::cerr << "Watchdog was SUPPRESSED (allowed " << allowedSeconds << "s)\n"; + } + if (label && label[0] != '\0') { + std::cerr << "Last progress marker: " << label << "\n"; + } + if (progressIndex) { + std::cerr << "Progress index: " << idx << "\n"; + } + std::cerr << "The render loop is not progressing.\n"; + std::cerr << "Aborting to generate stack trace...\n"; + std::cerr << "========================================\n\n"; + std::abort(); // Force crash with stack trace + } + } + + std::cout << "[Watchdog] Stopped\n"; +} + +// Renderer core implementation for the "Rendering Pipeline" chapter of the tutorial. +Renderer::Renderer(Platform* platform) : platform(platform) { + // Initialize deviceExtensions with required extensions only + // Optional extensions will be added later after checking device support + deviceExtensions = requiredDeviceExtensions; +} + +// Destructor +Renderer::~Renderer() { + Cleanup(); +} + +// Initialize the renderer +bool Renderer::Initialize(const std::string& appName, bool enableValidationLayers, bool debugSync) { + // Initialize the Vulkan-Hpp default dispatcher using the global symbol directly. + // This avoids differences across Vulkan-Hpp versions for DynamicLoader placement.
+ VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); + // Create a Vulkan instance + if (!createInstance(appName, enableValidationLayers, debugSync)) { + std::cerr << "Failed to create Vulkan instance" << std::endl; + return false; + } + + // Setup debug messenger + if (!setupDebugMessenger(enableValidationLayers)) { + std::cerr << "Failed to setup debug messenger" << std::endl; + return false; + } + + // Create surface + if (!createSurface()) { + std::cerr << "Failed to create surface" << std::endl; + return false; + } + + // Pick the physical device + if (!pickPhysicalDevice()) { + std::cerr << "Failed to pick physical device" << std::endl; + return false; + } + + // Create logical device + if (!createLogicalDevice(enableValidationLayers)) { + std::cerr << "Failed to create logical device" << std::endl; + return false; + } + + // Initialize memory pool for efficient memory management + try { + memoryPool = std::make_unique(device, physicalDevice); + if (!memoryPool->initialize()) { + std::cerr << "Failed to initialize memory pool" << std::endl; + return false; + } + } catch (const std::exception& e) { + std::cerr << "Failed to create memory pool: " << e.what() << std::endl; + return false; + } + + // Create timeline semaphore for frame-in-flight and cross-system synchronization + vk::SemaphoreTypeCreateInfo timelineCreateInfo{ + .semaphoreType = vk::SemaphoreType::eTimeline, + .initialValue = 0 + }; + vk::SemaphoreCreateInfo semaphoreCreateInfo{ + .pNext = &timelineCreateInfo + }; + frameTimeline = vk::raii::Semaphore(device, semaphoreCreateInfo); + + // Create swap chain + if (!createSwapChain()) { + std::cerr << "Failed to create swap chain" << std::endl; + return false; + } + + // Create image views + if (!createImageViews()) { + std::cerr << "Failed to create image views" << std::endl; + return false; + } + + // Setup dynamic rendering + if (!setupDynamicRendering()) { + std::cerr << "Failed to setup dynamic rendering" << std::endl; + return false; + 
} + + // Create the descriptor set layout + if (!createDescriptorSetLayout()) { + std::cerr << "Failed to create descriptor set layout" << std::endl; + return false; + } + + // Create the graphics pipeline + if (!createGraphicsPipeline()) { + std::cerr << "Failed to create graphics pipeline" << std::endl; + return false; + } + + // Create PBR pipeline + if (!createPBRPipeline()) { + std::cerr << "Failed to create PBR pipeline" << std::endl; + return false; + } + + // Create the lighting pipeline + if (!createLightingPipeline()) { + std::cerr << "Failed to create lighting pipeline" << std::endl; + return false; + } + + // Create composite pipeline (fullscreen pass for off-screen → swapchain) + if (!createCompositePipeline()) { + std::cerr << "Failed to create composite pipeline" << std::endl; + return false; + } + + // Create compute pipeline + if (!createComputePipeline()) { + std::cerr << "Failed to create compute pipeline" << std::endl; + return false; + } + + // Ensure light storage buffers exist before creating Forward+ resources + // so that compute descriptor binding 0 (lights SSBO) can be populated safely. 
+ if (!createOrResizeLightStorageBuffers(1)) { + std::cerr << "Failed to create initial light storage buffers" << std::endl; + return false; + } + + // Create Forward+ compute and depth pre-pass pipelines/resources + if (useForwardPlus) { + if (!createForwardPlusPipelinesAndResources()) { + std::cerr << "Failed to create Forward+ resources" << std::endl; + return false; + } + } + + // Create ray query descriptor set layout and pipeline (but not resources yet - need descriptor pool first) + if (!createRayQueryDescriptorSetLayout()) { + std::cerr << "Failed to create ray query descriptor set layout" << std::endl; + return false; + } + if (!createRayQueryPipeline()) { + std::cerr << "Failed to create ray query pipeline" << std::endl; + return false; + } + + // Create the command pool + if (!createCommandPool()) { + std::cerr << "Failed to create command pool" << std::endl; + return false; + } + + // Create depth resources + if (!createDepthResources()) { + std::cerr << "Failed to create depth resources" << std::endl; + return false; + } + + if (useForwardPlus) { + if (!createDepthPrepassPipeline()) { + std::cerr << "Failed to create depth prepass pipeline" << std::endl; + return false; + } + } + + // Create the descriptor pool + if (!createDescriptorPool()) { + std::cerr << "Failed to create descriptor pool" << std::endl; + return false; + } + + // Create ray query resources AFTER descriptor pool (needs pool for descriptor set allocation) + if (!createRayQueryResources()) { + std::cerr << "Failed to create ray query resources" << std::endl; + return false; + } + + // Note: Acceleration structure build is requested by scene_loading.cpp after entities load + // No need to request it here during init + + // Light storage buffers were already created earlier to satisfy Forward+ binding requirements + + if (!createOpaqueSceneColorResources()) { + std::cerr << "Failed to create opaque scene color resources" << std::endl; + return false; + } + + 
createTransparentDescriptorSets(); + + // Create default texture resources + if (!createDefaultTextureResources()) { + std::cerr << "Failed to create default texture resources" << std::endl; + return false; + } + + // Create fallback transparent descriptor sets (must occur after default textures exist) + createTransparentFallbackDescriptorSets(); + + // Create shared default PBR textures (to avoid creating hundreds of identical textures) + if (!createSharedDefaultPBRTextures()) { + std::cerr << "Failed to create shared default PBR textures" << std::endl; + return false; + } + + // Create command buffers + if (!createCommandBuffers()) { + std::cerr << "Failed to create command buffers" << std::endl; + return false; + } + + // Create sync objects + if (!createSyncObjects()) { + std::cerr << "Failed to create sync objects" << std::endl; + return false; + } + + // Initialize background thread pool for async tasks (textures, etc.) AFTER all Vulkan resources are ready + try { + // Size the thread pool based on hardware concurrency, clamped to a sensible range + unsigned int hw = std::max(2u, std::min(8u, std::thread::hardware_concurrency() ? 
std::thread::hardware_concurrency() : 4u)); + threadPool = std::make_unique(hw); + } catch (const std::exception& e) { + std::cerr << "Failed to create thread pool: " << e.what() << std::endl; + return false; + } + + // Start background uploads worker now that queues/semaphores exist + StartUploadsWorker(); + + // Start watchdog thread to detect application hangs + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + watchdogRunning.store(true, std::memory_order_relaxed); + watchdogThread = std::thread(WatchdogThreadFunc, &lastFrameUpdateTime, &watchdogRunning, &watchdogSuppressed, &watchdogProgressLabel, &watchdogProgressIndex); + + std::cout << "[Watchdog] Started - will abort if no frame updates for 10+ seconds\n"; + + initialized = true; + return true; +} + +void Renderer::ensureThreadLocalVulkanInit() const { + // Initialize Vulkan-Hpp dispatcher per-thread; required for multi-threaded RAII usage + static thread_local bool s_tlsInitialized = false; + if (s_tlsInitialized) + return; + try { + // Initialize the dispatcher for this thread using the global symbol. + VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); + if (*instance) { + VULKAN_HPP_DEFAULT_DISPATCHER.init(*instance); + } + if (*device) { + VULKAN_HPP_DEFAULT_DISPATCHER.init(*device); + } + s_tlsInitialized = true; + } catch (...) 
{ + // best-effort + } +} + +// Clean up renderer resources +void Renderer::KickWatchdog() { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); +} + +void Renderer::Cleanup() { + // Stop watchdog thread first to prevent false hang detection during shutdown + if (watchdogRunning.load(std::memory_order_relaxed)) { + watchdogRunning.store(false, std::memory_order_relaxed); + if (watchdogThread.joinable()) { + watchdogThread.join(); + } + } + + // Ensure background workers are stopped before tearing down Vulkan resources + StopUploadsWorker(); + + // Disallow any further descriptor writes during shutdown. + // This prevents late updates/frees racing against pool destruction. + descriptorSetsValid.store(false, std::memory_order_relaxed); { + std::lock_guard lk(pendingDescMutex); + pendingDescOps.clear(); + descriptorRefreshPending.store(false, std::memory_order_relaxed); + } { + std::unique_lock lock(threadPoolMutex); + if (threadPool) { + threadPool.reset(); + } + } + + if (!initialized) { + return; + } + + std::cout << "Starting renderer cleanup..." << std::endl; + + // Wait for the device to be idle before cleaning up + try { + WaitIdle(); + } catch (...) 
{ + } + + // 1) Clean up any swapchain-scoped resources first + cleanupSwapChain(); + + // 2) Clear per-entity resources (descriptor sets and buffers) while descriptor pools still exist + for (auto& kv : entityResources) { + auto& resources = kv.second; + resources.basicDescriptorSets.clear(); + resources.pbrDescriptorSets.clear(); + resources.uniformBuffers.clear(); + resources.uniformBufferAllocations.clear(); + resources.uniformBuffersMapped.clear(); + resources.instanceBuffer = nullptr; + resources.instanceBufferAllocation = nullptr; + resources.instanceBufferMapped = nullptr; + } + entityResources.clear(); + + // 3) Clear any global descriptor sets that are allocated from pools to avoid dangling refs + transparentDescriptorSets.clear(); + transparentFallbackDescriptorSets.clear(); + compositeDescriptorSets.clear(); + computeDescriptorSets.clear(); + rqCompositeDescriptorSets.clear(); + + // 3.5) Clear ray query descriptor sets BEFORE destroying descriptor pool + // Without this, rayQueryDescriptorSets' RAII destructor tries to free them after + // the pool is destroyed, causing "Invalid VkDescriptorPool Object" validation errors + rayQueryDescriptorSets.clear(); + + // Ray Query composite sampler/sets are allocated from the shared descriptor pool. + // Ensure they are released before destroying the pool. 
+ rqCompositeSampler = nullptr; + + // 4) Destroy/Reset pipelines and pipeline layouts (graphics/compute/forward+) + graphicsPipeline = nullptr; + pbrGraphicsPipeline = nullptr; + pbrBlendGraphicsPipeline = nullptr; + pbrPremulBlendGraphicsPipeline = nullptr; + pbrPrepassGraphicsPipeline = nullptr; + glassGraphicsPipeline = nullptr; + lightingPipeline = nullptr; + compositePipeline = nullptr; + forwardPlusPipeline = nullptr; + depthPrepassPipeline = nullptr; + + pipelineLayout = nullptr; + pbrPipelineLayout = nullptr; + lightingPipelineLayout = nullptr; + compositePipelineLayout = nullptr; + pbrTransparentPipelineLayout = nullptr; + forwardPlusPipelineLayout = nullptr; + + // 4.3) Ray query pipelines and layouts + rayQueryPipeline = nullptr; + rayQueryPipelineLayout = nullptr; + + // 4.5) Forward+ per-frame resources (including descriptor sets) must be released + // BEFORE destroying descriptor pools to avoid vkFreeDescriptorSets with invalid pool + for (auto& fp : forwardPlusPerFrame) { + fp.tileHeaders = nullptr; + fp.tileHeadersAlloc = nullptr; + fp.tileLightIndices = nullptr; + fp.tileLightIndicesAlloc = nullptr; + fp.params = nullptr; + fp.paramsAlloc = nullptr; + fp.paramsMapped = nullptr; + fp.debugOut = nullptr; + fp.debugOutAlloc = nullptr; + fp.probeOffscreen = nullptr; + fp.probeOffscreenAlloc = nullptr; + fp.probeSwapchain = nullptr; + fp.probeSwapchainAlloc = nullptr; + fp.computeSet = nullptr; // descriptor set allocated from compute/graphics pools + } + forwardPlusPerFrame.clear(); + + // 5) Destroy descriptor set layouts and pools (compute + graphics) + descriptorSetLayout = nullptr; + pbrDescriptorSetLayout = nullptr; + transparentDescriptorSetLayout = nullptr; + compositeDescriptorSetLayout = nullptr; + forwardPlusDescriptorSetLayout = nullptr; + computeDescriptorSetLayout = nullptr; + rayQueryDescriptorSetLayout = nullptr; + + // Pools last, after sets are cleared + computeDescriptorPool = nullptr; + descriptorPool = nullptr; + + // 6) Clear 
textures and aliases, including default resources + { + std::unique_lock lk(textureResourcesMutex); + textureResources.clear(); + textureAliases.clear(); + } + // Reset default texture resources + defaultTextureResources.textureSampler = nullptr; + defaultTextureResources.textureImageView = nullptr; + defaultTextureResources.textureImage = nullptr; + defaultTextureResources.textureImageAllocation = nullptr; + + // 7) Opaque scene color and related descriptors + opaqueSceneColorSampler = nullptr; + opaqueSceneColorImages.clear(); + opaqueSceneColorImageAllocations.clear(); + opaqueSceneColorImageViews.clear(); + opaqueSceneColorImageLayouts.clear(); + + // 7.5) Ray query output image and acceleration structures + rayQueryOutputImageView = nullptr; + rayQueryOutputImage = nullptr; + rayQueryOutputImageAllocation = nullptr; + + // Clear acceleration structures (BLAS and TLAS buffers) + blasStructures.clear(); + tlasStructure = AccelerationStructure{}; + + // 8) (moved above) Forward+ per-frame buffers cleared prior to pool destruction + + // 9) Command buffers/pools + commandBuffers.clear(); + commandPool = nullptr; + computeCommandPool = nullptr; + + // 10) Sync objects + imageAvailableSemaphores.clear(); + renderFinishedSemaphores.clear(); + inFlightFences.clear(); + uploadsTimeline = nullptr; + + // 11) Queues and surface (RAII handles will release upon reset; keep device alive until the end) + graphicsQueue = nullptr; + presentQueue = nullptr; + computeQueue = nullptr; + transferQueue = nullptr; + surface = nullptr; + + // 12) Memory pool last + memoryPool.reset(); + + // Finally mark uninitialized + initialized = false; + std::cout << "Renderer cleanup completed." 
<< std::endl; +} + +// Create instance +bool Renderer::createInstance(const std::string& appName, bool enableValidationLayers, bool debugSync) { + try { + // Create application info + vk::ApplicationInfo appInfo{ + .pApplicationName = appName.c_str(), + .applicationVersion = VK_MAKE_VERSION(1, 0, 0), + .pEngineName = "Sync2 Engine", + .engineVersion = VK_MAKE_VERSION(1, 0, 0), + .apiVersion = VK_API_VERSION_1_4 + }; + + // Get required extensions + std::vector extensions; + + // Add required extensions for GLFW +#if defined(PLATFORM_DESKTOP) + uint32_t glfwExtensionCount = 0; + const char** glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + extensions.insert(extensions.end(), glfwExtensions, glfwExtensions + glfwExtensionCount); +#endif + + // Add debug extension if validation layers are enabled + if (enableValidationLayers) { + extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + + // Add validation features if debug sync is requested + if (debugSync) { + extensions.push_back(VK_EXT_VALIDATION_FEATURES_EXTENSION_NAME); + } + + // Create instance info + vk::InstanceCreateInfo createInfo{ + .pApplicationInfo = &appInfo, + .enabledExtensionCount = static_cast(extensions.size()), + .ppEnabledExtensionNames = extensions.data() + }; + + // Set up validation features if requested + vk::ValidationFeaturesEXT validationFeatures; + std::vector enabledValidationFeatures; + + if (debugSync) { + enabledValidationFeatures.push_back(vk::ValidationFeatureEnableEXT::eSynchronizationValidation); + } + + // Enable validation layers if requested + if (enableValidationLayers) { + if (!checkValidationLayerSupport()) { + std::cerr << "Validation layers requested, but not available" << std::endl; + return false; + } + + createInfo.enabledLayerCount = static_cast(validationLayers.size()); + createInfo.ppEnabledLayerNames = validationLayers.data(); + } + + if (!enabledValidationFeatures.empty()) { + validationFeatures.enabledValidationFeatureCount = 
static_cast(enabledValidationFeatures.size()); + validationFeatures.pEnabledValidationFeatures = enabledValidationFeatures.data(); + createInfo.pNext = &validationFeatures; + } + + // Create instance + instance = vk::raii::Instance(context, createInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create instance: " << e.what() << std::endl; + return false; + } +} + +// Setup debug messenger +bool Renderer::setupDebugMessenger(bool enableValidationLayers) { + if (!enableValidationLayers) { + return true; + } + + try { + // Create debug messenger info + vk::DebugUtilsMessengerCreateInfoEXT createInfo{}; + createInfo.messageSeverity = vk::DebugUtilsMessageSeverityFlagBitsEXT::eVerbose | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eInfo | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning | + vk::DebugUtilsMessageSeverityFlagBitsEXT::eError; + createInfo.messageType = vk::DebugUtilsMessageTypeFlagBitsEXT::eGeneral | + vk::DebugUtilsMessageTypeFlagBitsEXT::eValidation | + vk::DebugUtilsMessageTypeFlagBitsEXT::ePerformance; + + // Select callback via simple platform macro: Android typically expects C PFN types in headers + // while desktop (newer Vulkan-Hpp) expects vk:: types. 
+#if defined(__ANDROID__) + createInfo.pfnUserCallback = &debugCallbackVkRaii; +#else + createInfo.pfnUserCallback = &debugCallbackVkHpp; +#endif + + // Create debug messenger + debugMessenger = vk::raii::DebugUtilsMessengerEXT(instance, createInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to set up debug messenger: " << e.what() << std::endl; + return false; + } +} + +// Create surface +bool Renderer::createSurface() { + try { + // Create surface + VkSurfaceKHR _surface; + if (!platform->CreateVulkanSurface(*instance, &_surface)) { + std::cerr << "Failed to create window surface" << std::endl; + return false; + } + + surface = vk::raii::SurfaceKHR(instance, _surface); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create surface: " << e.what() << std::endl; + return false; + } +} + +// Pick a physical device +bool Renderer::pickPhysicalDevice() { + try { + // Get available physical devices + std::vector devices = instance.enumeratePhysicalDevices(); + + if (devices.empty()) { + std::cerr << "Failed to find GPUs with Vulkan support" << std::endl; + return false; + } + + // Prioritize discrete GPUs (like NVIDIA RTX 2080) over integrated GPUs (like Intel UHD Graphics) + // First, collect all suitable devices with their suitability scores + std::multimap suitableDevices; + + for (auto& _device : devices) { + // Print device properties for debugging + vk::PhysicalDeviceProperties deviceProperties = _device.getProperties(); + std::cout << "Checking device: " << deviceProperties.deviceName + << " (Type: " << vk::to_string(deviceProperties.deviceType) << ")" << std::endl; + + // Check if the device supports Vulkan 1.3 + bool supportsVulkan1_3 = deviceProperties.apiVersion >= VK_API_VERSION_1_3; + if (!supportsVulkan1_3) { + std::cout << " - Does not support Vulkan 1.3" << std::endl; + continue; + } + + // Check queue families + QueueFamilyIndices indices = findQueueFamilies(_device); + bool 
supportsGraphics = indices.isComplete(); + if (!supportsGraphics) { + std::cout << " - Missing required queue families" << std::endl; + continue; + } + + // Check device extensions + bool supportsAllRequiredExtensions = checkDeviceExtensionSupport(_device); + if (!supportsAllRequiredExtensions) { + std::cout << " - Missing required extensions" << std::endl; + continue; + } + + // Check swap chain support + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(_device); + bool swapChainAdequate = !swapChainSupport.formats.empty() && !swapChainSupport.presentModes.empty(); + if (!swapChainAdequate) { + std::cout << " - Inadequate swap chain support" << std::endl; + continue; + } + + // Check for required features + auto features = _device.getFeatures2(); + bool supportsRequiredFeatures = features.get().dynamicRendering; + if (!supportsRequiredFeatures) { + std::cout << " - Does not support required features (dynamicRendering)" << std::endl; + continue; + } + + // Calculate suitability score - prioritize discrete GPUs + int score = 0; + + // Discrete GPUs get the highest priority (NVIDIA RTX 2080, AMD, etc.) + if (deviceProperties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) { + score += 1000; + std::cout << " - Discrete GPU: +1000 points" << std::endl; + } + // Integrated GPUs get lower priority (Intel UHD Graphics, etc.) 
+ else if (deviceProperties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) { + score += 100; + std::cout << " - Integrated GPU: +100 points" << std::endl; + } + + // Add points for memory size (more VRAM is better) + vk::PhysicalDeviceMemoryProperties memProperties = _device.getMemoryProperties(); + for (uint32_t i = 0; i < memProperties.memoryHeapCount; i++) { + if (memProperties.memoryHeaps[i].flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + // Add 1 point per GB of VRAM + score += static_cast(memProperties.memoryHeaps[i].size / (1024 * 1024 * 1024)); + break; + } + } + + std::cout << " - Device is suitable with score: " << score << std::endl; + suitableDevices.emplace(score, _device); + } + + if (!suitableDevices.empty()) { + // Select the device with the highest score (discrete GPU with most VRAM) + physicalDevice = suitableDevices.rbegin()->second; + vk::PhysicalDeviceProperties deviceProperties = physicalDevice.getProperties(); + std::cout << "Selected device: " << deviceProperties.deviceName + << " (Type: " << vk::to_string(deviceProperties.deviceType) + << ", Score: " << suitableDevices.rbegin()->first << ")" << std::endl; + + // Store queue family indices for the selected device + queueFamilyIndices = findQueueFamilies(physicalDevice); + + // Add supported optional extensions + addSupportedOptionalExtensions(); + + return true; + } + std::cerr << "Failed to find a suitable GPU. Make sure your GPU supports Vulkan and has the required extensions." 
<< std::endl; + return false; + } catch (const std::exception& e) { + std::cerr << "Failed to pick physical device: " << e.what() << std::endl; + return false; + } +} + +// Add supported optional extensions +void Renderer::addSupportedOptionalExtensions() { + try { + // Get available extensions + auto availableExtensions = physicalDevice.enumerateDeviceExtensionProperties(); + + // Build a set of available extension names for quick lookup + std::unordered_set avail; + for (const auto& e : availableExtensions) { + avail.insert(e.extensionName); + } + + // Set of already added extensions to avoid duplicates + std::unordered_set added(deviceExtensions.begin(), deviceExtensions.end()); + + for (const auto& optionalExt : optionalDeviceExtensions) { + if (avail.contains(optionalExt) && !added.contains(optionalExt)) { + deviceExtensions.push_back(optionalExt); + added.insert(optionalExt); + std::cout << "Adding optional extension: " << optionalExt << std::endl; + } + } + } catch (const std::exception& e) { + std::cerr << "Warning: Failed to add optional extensions: " << e.what() << std::endl; + } +} + +// Create logical device +bool Renderer::createLogicalDevice(bool enableValidationLayers) { + try { + // 1. Setup Queues + std::vector queueCreateInfos; + std::set uniqueQueueFamilies = { + queueFamilyIndices.graphicsFamily.value(), + queueFamilyIndices.presentFamily.value(), + queueFamilyIndices.computeFamily.value(), + queueFamilyIndices.transferFamily.value() + }; + float queuePriority = 1.0f; + for (uint32_t queueFamily : uniqueQueueFamilies) { + queueCreateInfos.push_back({ + .queueFamilyIndex = queueFamily, + .queueCount = 1, + .pQueuePriorities = &queuePriority + }); + } + + // 2. 
Query Supported Features + auto supported = physicalDevice.getFeatures2< + vk::PhysicalDeviceFeatures2, + vk::PhysicalDeviceVulkan11Features, + vk::PhysicalDeviceVulkan12Features, + vk::PhysicalDeviceVulkan13Features, + vk::PhysicalDeviceVulkan14Features, + vk::PhysicalDeviceRobustness2FeaturesEXT, + vk::PhysicalDeviceAccelerationStructureFeaturesKHR, + vk::PhysicalDeviceRayQueryFeaturesKHR + >(); + + // 3. Setup Enabled Features using StructureChain for stability and zero-init + // Core versioned features are included in the chain. + // Extensions that are NOT promoted to 1.4 are linked manually. + vk::StructureChain< + vk::PhysicalDeviceFeatures2, + vk::PhysicalDeviceVulkan11Features, + vk::PhysicalDeviceVulkan12Features, + vk::PhysicalDeviceVulkan13Features, + vk::PhysicalDeviceVulkan14Features + > enabledChain; + + auto& f2 = enabledChain.get(); + auto const& sCore = supported.get().features; + f2.features.samplerAnisotropy = vk::True; + f2.features.depthBiasClamp = sCore.depthBiasClamp; + f2.features.shaderSampledImageArrayDynamicIndexing = sCore.shaderSampledImageArrayDynamicIndexing; + f2.features.shaderInt64 = sCore.shaderInt64; // Required for RQ + + auto& f11 = enabledChain.get(); + f11.shaderDrawParameters = vk::True; + + auto& f12 = enabledChain.get(); + auto const& s12 = supported.get(); + f12.descriptorIndexing = vk::True; + f12.shaderSampledImageArrayNonUniformIndexing = s12.shaderSampledImageArrayNonUniformIndexing; + f12.descriptorBindingPartiallyBound = s12.descriptorBindingPartiallyBound; + f12.descriptorBindingUpdateUnusedWhilePending = s12.descriptorBindingUpdateUnusedWhilePending; + f12.descriptorBindingSampledImageUpdateAfterBind = s12.descriptorBindingSampledImageUpdateAfterBind; + f12.descriptorBindingUniformBufferUpdateAfterBind = s12.descriptorBindingUniformBufferUpdateAfterBind; + f12.timelineSemaphore = vk::True; + f12.vulkanMemoryModel = vk::True; + f12.vulkanMemoryModelDeviceScope = s12.vulkanMemoryModelDeviceScope; + 
f12.bufferDeviceAddress = vk::True; + f12.storageBuffer8BitAccess = s12.storageBuffer8BitAccess; + + auto& f13 = enabledChain.get(); + f13.dynamicRendering = vk::True; + f13.synchronization2 = vk::True; + + auto& f14 = enabledChain.get(); + auto const& s14 = supported.get(); + f14.hostImageCopy = vk::True; + f14.dynamicRenderingLocalRead = s14.dynamicRenderingLocalRead; + f14.maintenance5 = s14.maintenance5; + f14.maintenance6 = s14.maintenance6; + f14.pushDescriptor = s14.pushDescriptor; + + // 4. Link Extensions (not in 1.4 core) + void** lastNext = &f14.pNext; + + vk::PhysicalDeviceRobustness2FeaturesEXT enabledRobust2{}; + if (std::find_if(deviceExtensions.begin(), deviceExtensions.end(), [](const char* n){ return std::strcmp(n, VK_EXT_ROBUSTNESS_2_EXTENSION_NAME) == 0; }) != deviceExtensions.end()) { + enabledRobust2 = supported.get(); + *lastNext = &enabledRobust2; + lastNext = &enabledRobust2.pNext; + if (enabledRobust2.robustBufferAccess2) f2.features.robustBufferAccess = vk::True; + } + + vk::PhysicalDeviceAccelerationStructureFeaturesKHR enabledAS{}; + if (std::find_if(deviceExtensions.begin(), deviceExtensions.end(), [](const char* n){ return std::strcmp(n, VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME) == 0; }) != deviceExtensions.end()) { + enabledAS = supported.get(); + *lastNext = &enabledAS; + lastNext = &enabledAS.pNext; + } + + vk::PhysicalDeviceRayQueryFeaturesKHR enabledRQ{}; + if (std::find_if(deviceExtensions.begin(), deviceExtensions.end(), [](const char* n){ return std::strcmp(n, VK_KHR_RAY_QUERY_EXTENSION_NAME) == 0; }) != deviceExtensions.end()) { + enabledRQ = supported.get(); + *lastNext = &enabledRQ; + lastNext = &enabledRQ.pNext; + } + +#if !defined(PLATFORM_ANDROID) + vk::PhysicalDeviceShaderTileImageFeaturesEXT enabledTile{}; + if (std::find_if(deviceExtensions.begin(), deviceExtensions.end(), [](const char* n){ return std::strcmp(n, VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME) == 0; }) != deviceExtensions.end()) { + auto s = 
physicalDevice.getFeatures2(); + enabledTile = s.get(); + *lastNext = &enabledTile; + lastNext = &enabledTile.pNext; + } +#endif + + *lastNext = nullptr; + + // 5. Create Logical Device + vk::DeviceCreateInfo createInfo{ + .pNext = &f2, + .queueCreateInfoCount = static_cast(queueCreateInfos.size()), + .pQueueCreateInfos = queueCreateInfos.data(), + .enabledExtensionCount = static_cast(deviceExtensions.size()), + .ppEnabledExtensionNames = deviceExtensions.data() + }; + + device = vk::raii::Device(physicalDevice, createInfo); + + // Query Acceleration Structure properties if supported + if (std::find_if(deviceExtensions.begin(), deviceExtensions.end(), [](const char* n) { + return std::strcmp(n, VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME) == 0; + }) != deviceExtensions.end()) { + auto propertiesChain = physicalDevice.getProperties2(); + accelStructProperties = propertiesChain.get(); + } + + // 6. Initialize Queues and semaphores + graphicsQueue = vk::raii::Queue(device, queueFamilyIndices.graphicsFamily.value(), 0); + presentQueue = vk::raii::Queue(device, queueFamilyIndices.presentFamily.value(), 0); + computeQueue = vk::raii::Queue(device, queueFamilyIndices.computeFamily.value(), 0); + transferQueue = vk::raii::Queue(device, queueFamilyIndices.transferFamily.value(), 0); + + // Record states + robustness2Enabled = (enabledRobust2.robustBufferAccess2 || enabledRobust2.robustImageAccess2 || enabledRobust2.nullDescriptor); + dynamicRenderingLocalReadEnabled = (f14.dynamicRenderingLocalRead == vk::True); + accelerationStructureEnabled = (enabledAS.accelerationStructure == vk::True); + rayQueryEnabled = (enabledRQ.rayQuery == vk::True); + std::cout << "[Renderer] Ray Query supported: " << (rayQueryEnabled ? "YES" : "NO") << std::endl; + std::cout << "[Renderer] Acceleration Structures supported: " << (accelerationStructureEnabled ? 
"YES" : "NO") << std::endl; + descriptorIndexingEnabled = (f12.descriptorIndexing == vk::True); + + vk::StructureChain timelineChain( + {}, + {.semaphoreType = vk::SemaphoreType::eTimeline, .initialValue = 0}); + uploadsTimeline = vk::raii::Semaphore(device, timelineChain.get()); + nextUploadTimelineValue.store(0, std::memory_order_relaxed); + nextFrameTimelineValue.store(0, std::memory_order_relaxed); + + initialized = true; + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create logical device: " << e.what() << std::endl; + return false; + } +} + +// Check validation layer support +bool Renderer::checkValidationLayerSupport() const { + // Get available layers + std::vector availableLayers = context.enumerateInstanceLayerProperties(); + + // Check if all requested layers are available + for (const char* layerName : validationLayers) { + bool layerFound = false; + + for (const auto& layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } + } + + if (!layerFound) { + return false; + } + } + + return true; +} diff --git a/attachments/sync2_engine/renderer_pipelines.cpp b/attachments/sync2_engine/renderer_pipelines.cpp new file mode 100644 index 00000000..346b3bec --- /dev/null +++ b/attachments/sync2_engine/renderer_pipelines.cpp @@ -0,0 +1,1409 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "mesh_component.h" +#include "renderer.h" +#include +#include +#include + +// This file contains pipeline-related methods from the Renderer class + +// Create a descriptor set layout +bool Renderer::createDescriptorSetLayout() { + try { + // Create binding for a uniform buffer + vk::DescriptorSetLayoutBinding uboLayoutBinding{ + .binding = 0, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }; + + // Create binding for texture sampler + vk::DescriptorSetLayoutBinding samplerLayoutBinding{ + .binding = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }; + + // Create a descriptor set layout + std::array bindings = {uboLayoutBinding, samplerLayoutBinding}; + + // Descriptor indexing: set per-binding flags for UPDATE_AFTER_BIND if enabled + vk::DescriptorSetLayoutBindingFlagsCreateInfo bindingFlagsInfo{}; + std::array bindingFlags{}; + if (descriptorIndexingEnabled) { + bindingFlags[0] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlags[1] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlagsInfo.bindingCount = static_cast(bindingFlags.size()); + bindingFlagsInfo.pBindingFlags = bindingFlags.data(); + } + + vk::DescriptorSetLayoutCreateInfo layoutInfo{}; + layoutInfo.bindingCount = static_cast(bindings.size()); + layoutInfo.pBindings = bindings.data(); + if (descriptorIndexingEnabled) { + layoutInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool; + layoutInfo.pNext = &bindingFlagsInfo; + } + + 
descriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +// Create PBR descriptor set layout +bool Renderer::createPBRDescriptorSetLayout() { + try { + // Create descriptor set layout bindings for PBR shader + std::array bindings = { + // Binding 0: Uniform buffer (UBO) + vk::DescriptorSetLayoutBinding{ + .binding = 0, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 1: Base color map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 2: Metallic roughness map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 2, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 3: Normal map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 3, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 4: Occlusion map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 4, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 5: Emissive map and sampler + vk::DescriptorSetLayoutBinding{ + .binding = 5, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = 
vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 6: Light storage buffer (shadows removed) + vk::DescriptorSetLayoutBinding{ + .binding = 6, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 7: Forward+ tile headers SSBO + vk::DescriptorSetLayoutBinding{ + .binding = 7, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 8: Forward+ tile light indices SSBO + vk::DescriptorSetLayoutBinding{ + .binding = 8, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 9: Fragment debug output buffer (optional) + vk::DescriptorSetLayoutBinding{ + .binding = 9, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 10: Reflection texture (planar reflections) + vk::DescriptorSetLayoutBinding{ + .binding = 10, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 11: TLAS (ray-query shadows in raster fragment shader) + vk::DescriptorSetLayoutBinding{ + .binding = 11, + .descriptorType = vk::DescriptorType::eAccelerationStructureKHR, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 12: Ray-query geometry info buffer (per-instance addresses + material indices) + vk::DescriptorSetLayoutBinding{ + .binding = 12, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = 
vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + }, + // Binding 13: Ray-query material buffer (PBR material properties) + vk::DescriptorSetLayoutBinding{ + .binding = 13, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .pImmutableSamplers = nullptr + } + }; + + // Create a descriptor set layout + // Descriptor indexing: set per-binding flags for UPDATE_AFTER_BIND on UBO (0) and sampled images (1..5) + vk::DescriptorSetLayoutBindingFlagsCreateInfo bindingFlagsInfo{}; + std::array bindingFlags{}; + if (descriptorIndexingEnabled) { + bindingFlags[0] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlags[1] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlags[10] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + bindingFlagsInfo.bindingCount = static_cast(bindingFlags.size()); + bindingFlagsInfo.pBindingFlags = bindingFlags.data(); + } + + vk::DescriptorSetLayoutCreateInfo layoutInfo{}; + layoutInfo.bindingCount = static_cast(bindings.size()); + layoutInfo.pBindings = bindings.data(); + if (descriptorIndexingEnabled) { + layoutInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool; + layoutInfo.pNext = &bindingFlagsInfo; + } + + pbrDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo); + + // Binding 7: transparent passes input + // Layout for Set 1: Just the scene color texture + vk::DescriptorSetLayoutBinding sceneColorBinding{ + .binding = 0, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eFragment + }; + vk::DescriptorSetLayoutCreateInfo transparentLayoutInfo{.bindingCount = 1, .pBindings = &sceneColorBinding}; + if (descriptorIndexingEnabled) { + 
// Make this sampler binding update-after-bind safe as well (optional) + vk::DescriptorSetLayoutBindingFlagsCreateInfo transBindingFlagsInfo{}; + vk::DescriptorBindingFlags transFlags = vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending; + transBindingFlagsInfo.bindingCount = 1; + transBindingFlagsInfo.pBindingFlags = &transFlags; + transparentLayoutInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool; + transparentLayoutInfo.pNext = &transBindingFlagsInfo; + + // Create the layout while the pNext chain is still valid (avoid dangling pointer) + transparentDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, transparentLayoutInfo); + } else { + // Create without extra binding flags + transparentDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, transparentLayoutInfo); + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create PBR descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +// Create a graphics pipeline +bool Renderer::createGraphicsPipeline() { + try { + // Read shader code + auto shaderCode = readFile("shaders/texturedMesh.spv"); + + // Create shader modules + vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "PSMain" + }; + + // Fragment entry point specialized for architectural glass + vk::PipelineShaderStageCreateInfo fragGlassStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "GlassPSMain" + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Create vertex input 
info with instancing support + auto vertexBindingDescription = Vertex::getBindingDescription(); + auto instanceBindingDescription = InstanceData::getBindingDescription(); + std::array bindingDescriptions = { + vertexBindingDescription, + instanceBindingDescription + }; + + auto vertexAttributeDescriptions = Vertex::getAttributeDescriptions(); + auto instanceAttributeDescriptions = InstanceData::getAttributeDescriptions(); + + // Combine all attribute descriptions (no duplicates) + std::vector allAttributeDescriptions; + allAttributeDescriptions.insert(allAttributeDescriptions.end(), vertexAttributeDescriptions.begin(), vertexAttributeDescriptions.end()); + allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceAttributeDescriptions.begin(), instanceAttributeDescriptions.end()); + + // Note: materialIndex attribute (Location 11) is not used by current shaders + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = static_cast(bindingDescriptions.size()), + .pVertexBindingDescriptions = bindingDescriptions.data(), + .vertexAttributeDescriptionCount = static_cast(allAttributeDescriptions.size()), + .pVertexAttributeDescriptions = allAttributeDescriptions.data() + }; + + // Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Create viewport state info + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .scissorCount = 1 + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eNone, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo 
multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE + }; + + // Create depth stencil state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + // Use LessOrEqual so that the main shading pass works after a depth pre-pass + .depthCompareOp = vk::CompareOp::eLessOrEqual, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE + }; + + // Create a color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_FALSE, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment + }; + + // Create dynamic state info + std::vector dynamicStates = { + vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = 1, + .pSetLayouts = &*descriptorSetLayout, + .pushConstantRangeCount = 0, + .pPushConstantRanges = nullptr + }; + + pipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Create pipeline rendering info + vk::Format depthFormat = findDepthFormat(); + std::cout << "Creating main graphics pipeline with depth format: " << static_cast(depthFormat) << std::endl; + + // Initialize member variable for proper lifetime management + mainPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{ + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat, + .depthAttachmentFormat = 
depthFormat, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + // Create the graphics pipeline + vk::PipelineRasterizationStateCreateInfo rasterizerBack = rasterizer; + // Disable back-face culling for opaque PBR to avoid disappearing geometry when + // instance/model transforms flip winding (ensures PASS 1 actually shades pixels) + rasterizerBack.cullMode = vk::CullModeFlagBits::eNone; + + vk::GraphicsPipelineCreateInfo pipelineInfo{ + .pNext = &mainPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *pipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + + graphicsPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create graphics pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create PBR pipeline +bool Renderer::createPBRPipeline() { + try { + // Create PBR descriptor set layout + if (!createPBRDescriptorSetLayout()) { + return false; + } + + // Read shader code + auto shaderCode = readFile("shaders/pbr.spv"); + + // Create shader modules + vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "PSMain" + }; + + // Fragment entry point specialized for architectural 
glass + vk::PipelineShaderStageCreateInfo fragGlassStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "GlassPSMain" + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Define vertex and instance binding descriptions + auto vertexBindingDescription = Vertex::getBindingDescription(); + auto instanceBindingDescription = InstanceData::getBindingDescription(); + std::array bindingDescriptions = { + vertexBindingDescription, + instanceBindingDescription + }; + + // Define vertex and instance attribute descriptions + auto vertexAttributeDescriptions = Vertex::getAttributeDescriptions(); + auto instanceModelMatrixAttributes = InstanceData::getModelMatrixAttributeDescriptions(); + auto instanceNormalMatrixAttributes = InstanceData::getNormalMatrixAttributeDescriptions(); + + // Combine all attribute descriptions + std::vector allAttributeDescriptions; + allAttributeDescriptions.insert(allAttributeDescriptions.end(), vertexAttributeDescriptions.begin(), vertexAttributeDescriptions.end()); + allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceModelMatrixAttributes.begin(), instanceModelMatrixAttributes.end()); + allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceNormalMatrixAttributes.begin(), instanceNormalMatrixAttributes.end()); + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = static_cast(bindingDescriptions.size()), + .pVertexBindingDescriptions = bindingDescriptions.data(), + .vertexAttributeDescriptionCount = static_cast(allAttributeDescriptions.size()), + .pVertexAttributeDescriptions = allAttributeDescriptions.data() + }; + + // Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Create viewport state info + vk::PipelineViewportStateCreateInfo 
viewportState{ + .viewportCount = 1, + .scissorCount = 1 + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eNone, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE + }; + + // Create depth stencil state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLess, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE + }; + + // Create a color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_FALSE, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment + }; + + // Create dynamic state info + std::vector dynamicStates = { + vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create push constant range for material properties + vk::PushConstantRange pushConstantRange{ + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .offset = 0, + .size = sizeof(MaterialProperties) + }; + + std::array transparentSetLayouts = {*pbrDescriptorSetLayout, *transparentDescriptorSetLayout}; + // Create a 
pipeline layout for opaque PBR with only the PBR descriptor set (set 0) + std::array pbrOnlySetLayouts = {*pbrDescriptorSetLayout}; + // Create BOTH pipeline layouts with two descriptor sets (PBR set 0 + scene color set 1) + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = static_cast(transparentSetLayouts.size()), + .pSetLayouts = transparentSetLayouts.data(), + .pushConstantRangeCount = 1, + .pPushConstantRanges = &pushConstantRange + }; + + pbrPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Transparent PBR layout uses the same two-set layout + vk::PipelineLayoutCreateInfo transparentPipelineLayoutInfo{.setLayoutCount = static_cast(transparentSetLayouts.size()), .pSetLayouts = transparentSetLayouts.data(), .pushConstantRangeCount = 1, .pPushConstantRanges = &pushConstantRange}; + pbrTransparentPipelineLayout = vk::raii::PipelineLayout(device, transparentPipelineLayoutInfo); + + // Create pipeline rendering info + vk::Format depthFormat = findDepthFormat(); + + // Initialize member variable for proper lifetime management + pbrPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{ + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat, + .depthAttachmentFormat = depthFormat, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + // 1) Opaque PBR pipeline (no blending, depth writes enabled) + vk::PipelineColorBlendAttachmentState opaqueBlendAttachment = colorBlendAttachment; + opaqueBlendAttachment.blendEnable = VK_FALSE; + vk::PipelineColorBlendStateCreateInfo colorBlendingOpaque{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &opaqueBlendAttachment + }; + vk::PipelineDepthStencilStateCreateInfo depthStencilOpaque = depthStencil; + depthStencilOpaque.depthWriteEnable = VK_TRUE; + + vk::PipelineRasterizationStateCreateInfo rasterizerBack = rasterizer; + rasterizerBack.cullMode = vk::CullModeFlagBits::eBack; + + // For 
architectural glass we often want to see both the inner and outer + // walls of thin shells (e.g., bar glasses viewed from above). Use + // no culling for the glass pipeline to render both sides, while + // keeping back-face culling for the generic PBR pipelines. + vk::PipelineRasterizationStateCreateInfo rasterizerGlass = rasterizer; + rasterizerGlass.cullMode = vk::CullModeFlagBits::eNone; + + vk::GraphicsPipelineCreateInfo opaquePipelineInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilOpaque, + .pColorBlendState = &colorBlendingOpaque, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + pbrGraphicsPipeline = vk::raii::Pipeline(device, nullptr, opaquePipelineInfo); + + // 1b) Opaque PBR pipeline variant for color pass after a depth pre-pass. + // Depth writes disabled (read-only) and compare against pre-pass depth. 
+ vk::PipelineDepthStencilStateCreateInfo depthStencilAfterPrepass = depthStencil; + depthStencilAfterPrepass.depthTestEnable = VK_TRUE; + depthStencilAfterPrepass.depthWriteEnable = VK_FALSE; + depthStencilAfterPrepass.depthCompareOp = vk::CompareOp::eEqual; + + vk::GraphicsPipelineCreateInfo opaqueAfterPrepassInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilAfterPrepass, + .pColorBlendState = &colorBlendingOpaque, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + pbrPrepassGraphicsPipeline = vk::raii::Pipeline(device, nullptr, opaqueAfterPrepassInfo); + + // 1c) Reflection PBR pipeline for mirrored off-screen pass (cull none to avoid winding issues) + vk::PipelineRasterizationStateCreateInfo rasterizerReflection = rasterizer; + rasterizerReflection.cullMode = vk::CullModeFlagBits::eNone; + vk::GraphicsPipelineCreateInfo reflectionPipelineInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerReflection, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilOpaque, + .pColorBlendState = &colorBlendingOpaque, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + pbrReflectionGraphicsPipeline = vk::raii::Pipeline(device, nullptr, reflectionPipelineInfo); + + 
// 2) Blended PBR pipeline (straight alpha blending, depth writes disabled for translucency) + vk::PipelineColorBlendAttachmentState blendedAttachment = colorBlendAttachment; + blendedAttachment.blendEnable = VK_TRUE; + // Straight alpha blending: out.rgb = src.rgb*src.a + dst.rgb*(1-src.a) + blendedAttachment.srcColorBlendFactor = vk::BlendFactor::eSrcAlpha; + blendedAttachment.dstColorBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha; + // Alpha channel keeps destination scaled by inverse src alpha + blendedAttachment.srcAlphaBlendFactor = vk::BlendFactor::eOne; + blendedAttachment.dstAlphaBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha; + vk::PipelineColorBlendStateCreateInfo colorBlendingBlended{.attachmentCount = 1, .pAttachments = &blendedAttachment}; + vk::PipelineDepthStencilStateCreateInfo depthStencilBlended = depthStencil; + depthStencilBlended.depthWriteEnable = VK_FALSE; + depthStencilBlended.depthCompareOp = vk::CompareOp::eLessOrEqual; + + vk::GraphicsPipelineCreateInfo blendedPipelineInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + // Use back-face culling for the blended (glass) pipeline to avoid + // rendering both front and back faces of thin glass geometry, which + // can cause flickering as the camera rotates due to overlapping + // transparent surfaces passing the depth test. 
+ .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilBlended, + .pColorBlendState = &colorBlendingBlended, + .pDynamicState = &dynamicState, + .layout = *pbrTransparentPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + pbrBlendGraphicsPipeline = vk::raii::Pipeline(device, nullptr, blendedPipelineInfo); + + // 3) Glass pipeline (architectural glass) - uses the same vertex input and + // descriptor layouts, but a dedicated fragment shader entry point + // (GlassPSMain) for more stable glass shading. + vk::PipelineShaderStageCreateInfo glassStages[] = {vertShaderStageInfo, fragGlassStageInfo}; + + vk::GraphicsPipelineCreateInfo glassPipelineInfo{ + + .pNext = &pbrPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = glassStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerGlass, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencilBlended, + .pColorBlendState = &colorBlendingBlended, + .pDynamicState = &dynamicState, + .layout = *pbrTransparentPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + glassGraphicsPipeline = vk::raii::Pipeline(device, nullptr, glassPipelineInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create PBR pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create fullscreen composite pipeline (samples off-screen color and writes to swapchain) +bool Renderer::createCompositePipeline() { + try { + // Reuse the transparent descriptor set layout (binding 0 = combined image sampler) + if (*transparentDescriptorSetLayout == nullptr) { + // Ensure PBR pipeline path created it + if (!createPBRPipeline()) { + return false; + 
} + } + + // Read composite shader code + auto shaderCode = readFile("shaders/composite.spv"); + vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode); + + // Shader stages + vk::PipelineShaderStageCreateInfo vert{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + vk::PipelineShaderStageCreateInfo frag{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "PSMain" + }; + vk::PipelineShaderStageCreateInfo stages[] = {vert, frag}; + + // No vertex inputs (fullscreen triangle via SV_VertexID) + vk::PipelineVertexInputStateCreateInfo vertexInput{}; + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{.topology = vk::PrimitiveTopology::eTriangleList}; + vk::PipelineViewportStateCreateInfo viewportState{.viewportCount = 1, .scissorCount = 1}; + vk::PipelineRasterizationStateCreateInfo rasterizer{.polygonMode = vk::PolygonMode::eFill, .cullMode = vk::CullModeFlagBits::eNone, .frontFace = vk::FrontFace::eCounterClockwise, .lineWidth = 1.0f}; + vk::PipelineMultisampleStateCreateInfo multisampling{.rasterizationSamples = vk::SampleCountFlagBits::e1}; + // No depth + vk::PipelineDepthStencilStateCreateInfo depthStencil{.depthTestEnable = VK_FALSE, .depthWriteEnable = VK_FALSE}; + // No blending (we clear swapchain before this and blend transparents later) + vk::PipelineColorBlendAttachmentState attachment{ + .blendEnable = VK_FALSE, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | + vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + vk::PipelineColorBlendStateCreateInfo colorBlending{.attachmentCount = 1, .pAttachments = &attachment}; + std::array dynStates = {vk::DynamicState::eViewport, vk::DynamicState::eScissor}; + vk::PipelineDynamicStateCreateInfo dynamicState{.dynamicStateCount = static_cast(dynStates.size()), .pDynamicStates = dynStates.data()}; + + // Pipeline layout: single set (combined image sampler) + push 
constants for exposure/gamma/srgb flag + vk::DescriptorSetLayout setLayouts[] = {*transparentDescriptorSetLayout}; + vk::PushConstantRange pushRange{.stageFlags = vk::ShaderStageFlagBits::eFragment, .offset = 0, .size = 16}; // matches struct Push in composite.slang + vk::PipelineLayoutCreateInfo plInfo{.setLayoutCount = 1, .pSetLayouts = setLayouts, .pushConstantRangeCount = 1, .pPushConstantRanges = &pushRange}; + compositePipelineLayout = vk::raii::PipelineLayout(device, plInfo); + + // Dynamic rendering info + compositePipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{ + + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat, + .depthAttachmentFormat = vk::Format::eUndefined, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + vk::GraphicsPipelineCreateInfo pipeInfo{ + + .pNext = &compositePipelineRenderingCreateInfo, + .stageCount = 2, + .pStages = stages, + .pVertexInputState = &vertexInput, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizer, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *compositePipelineLayout, + .renderPass = nullptr, + .subpass = 0 + }; + + compositePipeline = vk::raii::Pipeline(device, nullptr, pipeInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create composite pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create Depth Pre-pass pipeline (depth-only) +bool Renderer::createDepthPrepassPipeline() { + try { + // Use the same descriptor set layout and pipeline layout as PBR for UBOs and instancing + if (*pbrDescriptorSetLayout == nullptr || *pbrPipelineLayout == nullptr) { + if (!createPBRPipeline()) { + return false; + } + } + + // Read PBR shader (vertex only) + auto shaderCode = readFile("shaders/pbr.spv"); + vk::raii::ShaderModule shaderModule = 
createShaderModule(shaderCode); + + // Stages: Vertex only + vk::PipelineShaderStageCreateInfo vertStage{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + + // Vertex/instance bindings & attributes same as PBR + auto vertexBindingDescription = Vertex::getBindingDescription(); + auto instanceBindingDescription = InstanceData::getBindingDescription(); + std::array bindingDescriptions = { + vertexBindingDescription, + instanceBindingDescription + }; + + auto vertexAttributeDescriptions = Vertex::getAttributeDescriptions(); + auto instanceModelMatrixAttributes = InstanceData::getModelMatrixAttributeDescriptions(); + auto instanceNormalMatrixAttributes = InstanceData::getNormalMatrixAttributeDescriptions(); + std::vector allAttributes; + allAttributes.insert(allAttributes.end(), vertexAttributeDescriptions.begin(), vertexAttributeDescriptions.end()); + allAttributes.insert(allAttributes.end(), instanceModelMatrixAttributes.begin(), instanceModelMatrixAttributes.end()); + allAttributes.insert(allAttributes.end(), instanceNormalMatrixAttributes.begin(), instanceNormalMatrixAttributes.end()); + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = static_cast(bindingDescriptions.size()), + .pVertexBindingDescriptions = bindingDescriptions.data(), + .vertexAttributeDescriptionCount = static_cast(allAttributes.size()), + .pVertexAttributeDescriptions = allAttributes.data() + }; + + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Dummy viewport/scissor (dynamic) + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .scissorCount = 1 + }; + + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eBack, + .frontFace 
= vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .lineWidth = 1.0f + }; + + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1 + }; + + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLessOrEqual, + .depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE + }; + + // No color attachments + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .attachmentCount = 0, + .pAttachments = nullptr + }; + + std::array dynamicStates = {vk::DynamicState::eViewport, vk::DynamicState::eScissor}; + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + vk::Format depthFormat = findDepthFormat(); + vk::PipelineRenderingCreateInfo renderingInfo{ + .colorAttachmentCount = 0, + .pColorAttachmentFormats = nullptr, + .depthAttachmentFormat = depthFormat + }; + + vk::GraphicsPipelineCreateInfo pipelineInfo{ + .pNext = &renderingInfo, + .stageCount = 1, + .pStages = &vertStage, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizer, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *pbrPipelineLayout + }; + + depthPrepassPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create depth pre-pass pipeline: " << e.what() << std::endl; + return false; + } +} + +// Create a lighting pipeline +bool Renderer::createLightingPipeline() { + try { + // Read shader code + auto shaderCode = readFile("shaders/lighting.spv"); + + // Create shader modules + vk::raii::ShaderModule shaderModule 
= createShaderModule(shaderCode); + + // Create shader stage info + vk::PipelineShaderStageCreateInfo vertShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eVertex, + .module = *shaderModule, + .pName = "VSMain" + }; + + vk::PipelineShaderStageCreateInfo fragShaderStageInfo{ + .stage = vk::ShaderStageFlagBits::eFragment, + .module = *shaderModule, + .pName = "PSMain" + }; + + vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + // Create vertex input info + auto bindingDescription = Vertex::getBindingDescription(); + auto attributeDescriptions = Vertex::getAttributeDescriptions(); + + vk::PipelineVertexInputStateCreateInfo vertexInputInfo{ + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &bindingDescription, + .vertexAttributeDescriptionCount = static_cast(attributeDescriptions.size()), + .pVertexAttributeDescriptions = attributeDescriptions.data() + }; + + // Create input assembly info + vk::PipelineInputAssemblyStateCreateInfo inputAssembly{ + .topology = vk::PrimitiveTopology::eTriangleList, + .primitiveRestartEnable = VK_FALSE + }; + + // Create viewport state info + vk::PipelineViewportStateCreateInfo viewportState{ + .viewportCount = 1, + .scissorCount = 1 + }; + + // Create rasterization state info + vk::PipelineRasterizationStateCreateInfo rasterizer{ + .depthClampEnable = VK_FALSE, + .rasterizerDiscardEnable = VK_FALSE, + .polygonMode = vk::PolygonMode::eFill, + .cullMode = vk::CullModeFlagBits::eNone, + .frontFace = vk::FrontFace::eCounterClockwise, + .depthBiasEnable = VK_FALSE, + .lineWidth = 1.0f + }; + + // Create multisample state info + vk::PipelineMultisampleStateCreateInfo multisampling{ + .rasterizationSamples = vk::SampleCountFlagBits::e1, + .sampleShadingEnable = VK_FALSE + }; + + // Create depth stencil state info + vk::PipelineDepthStencilStateCreateInfo depthStencil{ + .depthTestEnable = VK_TRUE, + .depthWriteEnable = VK_TRUE, + .depthCompareOp = vk::CompareOp::eLess, + 
.depthBoundsTestEnable = VK_FALSE, + .stencilTestEnable = VK_FALSE + }; + + // Create a color blend attachment state + vk::PipelineColorBlendAttachmentState colorBlendAttachment{ + .blendEnable = VK_TRUE, + .srcColorBlendFactor = vk::BlendFactor::eSrcAlpha, + .dstColorBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha, + .colorBlendOp = vk::BlendOp::eAdd, + .srcAlphaBlendFactor = vk::BlendFactor::eOne, + .dstAlphaBlendFactor = vk::BlendFactor::eZero, + .alphaBlendOp = vk::BlendOp::eAdd, + .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA + }; + + // Create color blend state info + vk::PipelineColorBlendStateCreateInfo colorBlending{ + .logicOpEnable = VK_FALSE, + .logicOp = vk::LogicOp::eCopy, + .attachmentCount = 1, + .pAttachments = &colorBlendAttachment + }; + + // Create dynamic state info + std::vector dynamicStates = { + vk::DynamicState::eViewport, + vk::DynamicState::eScissor + }; + + vk::PipelineDynamicStateCreateInfo dynamicState{ + .dynamicStateCount = static_cast(dynamicStates.size()), + .pDynamicStates = dynamicStates.data() + }; + + // Create push constant range for material properties + vk::PushConstantRange pushConstantRange{ + .stageFlags = vk::ShaderStageFlagBits::eFragment, + .offset = 0, + .size = sizeof(MaterialProperties) + }; + + // Create pipeline layout + vk::PipelineLayoutCreateInfo pipelineLayoutInfo{ + .setLayoutCount = 1, + .pSetLayouts = &*descriptorSetLayout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &pushConstantRange + }; + + lightingPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Create pipeline rendering info + vk::Format depthFormat = findDepthFormat(); + + // Initialize member variable for proper lifetime management + lightingPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{ + + .colorAttachmentCount = 1, + .pColorAttachmentFormats = &swapChainImageFormat, + 
.depthAttachmentFormat = depthFormat, + .stencilAttachmentFormat = vk::Format::eUndefined + }; + + // Create a graphics pipeline + vk::PipelineRasterizationStateCreateInfo rasterizerBack = rasterizer; + rasterizerBack.cullMode = vk::CullModeFlagBits::eBack; + + vk::GraphicsPipelineCreateInfo pipelineInfo{ + + .pNext = &lightingPipelineRenderingCreateInfo, + .flags = vk::PipelineCreateFlags{}, + .stageCount = 2, + .pStages = shaderStages, + .pVertexInputState = &vertexInputInfo, + .pInputAssemblyState = &inputAssembly, + .pViewportState = &viewportState, + .pRasterizationState = &rasterizerBack, + .pMultisampleState = &multisampling, + .pDepthStencilState = &depthStencil, + .pColorBlendState = &colorBlending, + .pDynamicState = &dynamicState, + .layout = *lightingPipelineLayout, + .renderPass = nullptr, + .subpass = 0, + .basePipelineHandle = nullptr, + .basePipelineIndex = -1 + }; + + lightingPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create lighting pipeline: " << e.what() << std::endl; + return false; + } +} + +// Push material properties to the pipeline +void Renderer::pushMaterialProperties(vk::CommandBuffer commandBuffer, const MaterialProperties& material) const { + commandBuffer.pushConstants(*pbrPipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, sizeof(MaterialProperties), &material); +} + +bool Renderer::createRayQueryDescriptorSetLayout() { + // Production layout: 7 bindings (0..6), no debug buffer at 7 + std::array bindings{}; + + // Binding 0: UBO (UniformBufferObject) + bindings[0].binding = 0; + bindings[0].descriptorType = vk::DescriptorType::eUniformBuffer; + bindings[0].descriptorCount = 1; + bindings[0].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 1: TLAS (Top-Level Acceleration Structure) + bindings[1].binding = 1; + bindings[1].descriptorType = vk::DescriptorType::eAccelerationStructureKHR; + bindings[1].descriptorCount = 1; + 
bindings[1].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 2: Output image (storage image) + bindings[2].binding = 2; + bindings[2].descriptorType = vk::DescriptorType::eStorageImage; + bindings[2].descriptorCount = 1; + bindings[2].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 3: Light buffer (storage buffer) + bindings[3].binding = 3; + bindings[3].descriptorType = vk::DescriptorType::eStorageBuffer; + bindings[3].descriptorCount = 1; + bindings[3].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 4: Geometry info buffer (maps BLAS geometry index to vertex/index buffer addresses) + bindings[4].binding = 4; + bindings[4].descriptorType = vk::DescriptorType::eStorageBuffer; + bindings[4].descriptorCount = 1; + bindings[4].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 5: Material buffer (array of material properties) + bindings[5].binding = 5; + bindings[5].descriptorType = vk::DescriptorType::eStorageBuffer; + bindings[5].descriptorCount = 1; + bindings[5].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Binding 6: BaseColor textures array (combined image samplers) + bindings[6].binding = 6; + bindings[6].descriptorType = vk::DescriptorType::eCombinedImageSampler; + bindings[6].descriptorCount = RQ_MAX_TEX; // large static array + bindings[6].stageFlags = vk::ShaderStageFlagBits::eCompute; + + // Descriptor indexing / update-after-bind support: + // The ray query shader indexes a large `eCombinedImageSampler` array with a per-pixel varying index. + // On some drivers this requires descriptor indexing features + layout binding flags to avoid the + // array collapsing to slot 0 (resulting in "no textures" even when `texIndex>0`). + std::array bindingFlags{}; + if (descriptorIndexingEnabled) { + // Binding 6 is the large sampled texture array. 
+ bindingFlags[6] = vk::DescriptorBindingFlagBits::eUpdateAfterBind | + vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending | + vk::DescriptorBindingFlagBits::ePartiallyBound; + } + + vk::DescriptorSetLayoutBindingFlagsCreateInfo bindingFlagsInfo{}; + if (descriptorIndexingEnabled) { + bindingFlagsInfo.bindingCount = static_cast(bindingFlags.size()); + bindingFlagsInfo.pBindingFlags = bindingFlags.data(); + } + + vk::DescriptorSetLayoutCreateInfo layoutInfo{}; + if (descriptorIndexingEnabled) { + layoutInfo.pNext = &bindingFlagsInfo; + layoutInfo.flags = vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool; + } + layoutInfo.bindingCount = static_cast(bindings.size()); + layoutInfo.pBindings = bindings.data(); + + try { + rayQueryDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create ray query descriptor set layout: " << e.what() << std::endl; + return false; + } +} + +bool Renderer::createRayQueryPipeline() { + // Check if ray query is supported on this device + if (!rayQueryEnabled || !accelerationStructureEnabled) { + std::cout << "Ray query rendering not available on this device (missing VK_KHR_ray_query or VK_KHR_acceleration_structure support)\n"; + return true; // Not an error - just skip ray query pipeline creation + } + + // Load compiled shader module + auto shaderCode = readFile("shaders/ray_query.spv"); + if (shaderCode.empty()) { + std::cerr << "Failed to load ray query shader\n"; + return false; + } + + vk::ShaderModuleCreateInfo createInfo{}; + createInfo.codeSize = shaderCode.size(); + createInfo.pCode = reinterpret_cast(shaderCode.data()); + + vk::raii::ShaderModule shaderModule(device, createInfo); + + vk::PipelineShaderStageCreateInfo shaderStage{}; + shaderStage.stage = vk::ShaderStageFlagBits::eCompute; + shaderStage.module = *shaderModule; + shaderStage.pName = "main"; + + // Create pipeline layout + vk::PipelineLayoutCreateInfo 
pipelineLayoutInfo{}; + pipelineLayoutInfo.setLayoutCount = 1; + pipelineLayoutInfo.pSetLayouts = &(*rayQueryDescriptorSetLayout); + + rayQueryPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo); + + // Create compute pipeline + vk::ComputePipelineCreateInfo pipelineInfo{}; + pipelineInfo.stage = shaderStage; + pipelineInfo.layout = *rayQueryPipelineLayout; + + try { + rayQueryPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create ray query pipeline: " << e.what() << std::endl; + return false; + } +} + +bool Renderer::createRayQueryResources() { + try { + // Create output image using memory pool (storage image for compute shader) + // Use an HDR-capable format for Ray Query so PBR lighting can accumulate in linear space + // before composite applies exposure/gamma. + // Fall back to R8G8B8A8_UNORM if the device does not support storage-image usage. + vk::Format rqFormat = vk::Format::eR16G16B16A16Sfloat; { + auto props = physicalDevice.getFormatProperties(rqFormat); + if (!(props.optimalTilingFeatures & vk::FormatFeatureFlagBits::eStorageImage)) { + rqFormat = vk::Format::eR8G8B8A8Unorm; + } + } + auto [image, allocation] = memoryPool->createImage( + swapChainExtent.width, + swapChainExtent.height, + rqFormat, + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eStorage | vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eSampled, + vk::MemoryPropertyFlagBits::eDeviceLocal, + 1, + // mipLevels + vk::SharingMode::eExclusive, + {} // queueFamilies + ); + + rayQueryOutputImage = std::move(image); + rayQueryOutputImageAllocation = std::move(allocation); + + // Create image view + vk::ImageViewCreateInfo viewInfo{}; + viewInfo.image = *rayQueryOutputImage; + viewInfo.viewType = vk::ImageViewType::e2D; + viewInfo.format = rqFormat; + viewInfo.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + viewInfo.subresourceRange.baseMipLevel = 
0; + viewInfo.subresourceRange.levelCount = 1; + viewInfo.subresourceRange.baseArrayLayer = 0; + viewInfo.subresourceRange.layerCount = 1; + + rayQueryOutputImageView = vk::raii::ImageView(device, viewInfo); + + // Transition output image to GENERAL layout for compute shader writes + transitionImageLayout(*rayQueryOutputImage, + rqFormat, + vk::ImageLayout::eUndefined, + vk::ImageLayout::eGeneral, + 1); + + // Allocate descriptor sets (one per frame in flight) + std::vector layouts(MAX_FRAMES_IN_FLIGHT, *rayQueryDescriptorSetLayout); + vk::DescriptorSetAllocateInfo allocInfo{}; + allocInfo.descriptorPool = *descriptorPool; + allocInfo.descriptorSetCount = MAX_FRAMES_IN_FLIGHT; + allocInfo.pSetLayouts = layouts.data(); + + // Allocate into a temporary owning container, then move the individual RAII sets into our vector. + // (Avoid assigning `vk::raii::DescriptorSets` directly into `std::vector`.) + { + auto sets = vk::raii::DescriptorSets(device, allocInfo); + rayQueryDescriptorSets.clear(); + rayQueryDescriptorSets.reserve(sets.size()); + for (auto& s : sets) { + rayQueryDescriptorSets.emplace_back(std::move(s)); + } + } + + // Create descriptor sets for composite pass to sample the rayQueryOutputImage + // Reuse the transparentDescriptorSetLayout (binding 0 = combined image sampler) + if (*transparentDescriptorSetLayout == nullptr) { + // Ensure it exists (created by PBR path); + createPBRPipeline(); + } + if (*transparentDescriptorSetLayout != nullptr) { + // Ensure we have a valid sampler for sampling the ray-query output image + if (*rqCompositeSampler == nullptr) { + vk::SamplerCreateInfo sci{ + .magFilter = vk::Filter::eLinear, + .minFilter = vk::Filter::eLinear, + .mipmapMode = vk::SamplerMipmapMode::eNearest, + .addressModeU = vk::SamplerAddressMode::eClampToEdge, + .addressModeV = vk::SamplerAddressMode::eClampToEdge, + .addressModeW = vk::SamplerAddressMode::eClampToEdge, + .mipLodBias = 0.0f, + .anisotropyEnable = VK_FALSE, + .maxAnisotropy = 1.0f, + 
.compareEnable = VK_FALSE, + .compareOp = vk::CompareOp::eAlways, + .minLod = 0.0f, + .maxLod = 0.0f, + .borderColor = vk::BorderColor::eIntOpaqueBlack, + .unnormalizedCoordinates = VK_FALSE + }; + rqCompositeSampler = vk::raii::Sampler(device, sci); + } + std::vector rqLayouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout); + vk::DescriptorSetAllocateInfo rqAllocInfo{ + .descriptorPool = *descriptorPool, + .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, + .pSetLayouts = rqLayouts.data() + }; { + auto sets = vk::raii::DescriptorSets(device, rqAllocInfo); + rqCompositeDescriptorSets.clear(); + rqCompositeDescriptorSets.reserve(sets.size()); + for (auto& s : sets) { + rqCompositeDescriptorSets.emplace_back(std::move(s)); + } + } + + // Update each set to sample the rayQueryOutputImage + for (size_t i = 0; i < rqCompositeDescriptorSets.size(); ++i) { + // Use a dedicated sampler to avoid null sampler issues during early init + vk::Sampler samplerHandle = *rqCompositeSampler; + vk::DescriptorImageInfo imgInfo{ + .sampler = samplerHandle, + .imageView = *rayQueryOutputImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + vk::WriteDescriptorSet write{ + .dstSet = *rqCompositeDescriptorSets[i], + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .pImageInfo = &imgInfo + }; + device.updateDescriptorSets({write}, {}); + } + } + + // Create dedicated UBO buffers for ray query (one per frame in flight) + rayQueryUniformBuffers.clear(); + rayQueryUniformAllocations.clear(); + rayQueryUniformBuffersMapped.clear(); + + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + auto [uboBuffer, uboAlloc] = createBufferPooled( + sizeof(RayQueryUniformBufferObject), + vk::BufferUsageFlagBits::eUniformBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + rayQueryUniformBuffers.push_back(std::move(uboBuffer)); + 
rayQueryUniformAllocations.push_back(std::move(uboAlloc)); + rayQueryUniformBuffersMapped.push_back(rayQueryUniformAllocations.back()->mappedPtr); + } + + std::cout << "Ray query resources created successfully (including " << MAX_FRAMES_IN_FLIGHT << " dedicated UBOs)\n"; + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create ray query resources: " << e.what() << std::endl; + return false; + } +} diff --git a/attachments/sync2_engine/renderer_ray_query.cpp b/attachments/sync2_engine/renderer_ray_query.cpp new file mode 100644 index 00000000..711109fd --- /dev/null +++ b/attachments/sync2_engine/renderer_ray_query.cpp @@ -0,0 +1,1748 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "entity.h" +#include "mesh_component.h" +#include "renderer.h" +#include "transform_component.h" +#include +#include +#include +#include +#include +#include + +// Helper function to get buffer device address +vk::DeviceAddress getBufferDeviceAddress(const vk::raii::Device& device, vk::Buffer buffer) { + vk::BufferDeviceAddressInfo addressInfo{}; + addressInfo.buffer = buffer; + return device.getBufferAddress(addressInfo); +} + +/** + * @brief Build acceleration structures for ray query rendering. + * + * Builds BLAS for each unique mesh and a TLAS for the entire scene. + * + * @param entities The entities to include in the acceleration structures. 
+ * @return True if successful, false otherwise. + */ +bool Renderer::buildAccelerationStructures(const std::vector& entities) { + auto asStartTime = std::chrono::steady_clock::now(); + if (!accelerationStructureEnabled || !rayQueryEnabled) { + std::cout << "Acceleration structures not supported on this device\n"; + return false; + } + + // Large scenes (Bistro) take seconds/minutes to build in Debug. Suppress watchdog + // so we don't abort, but we'll still update progress markers for user feedback. + ScopedWatchdogSuppression watchdogGuard(this); + watchdogProgressLabel.store("AS: buildAccelerationStructures start", std::memory_order_relaxed); + + try { + const auto asStartCpu = std::chrono::steady_clock::now(); + + // --- UI progress instrumentation (for long AS builds) --- + // We update these frequently during BLAS/TLAS builds so the loading overlay + // can display meaningful progress if the build takes > ~10 seconds. + auto nowNs = []() -> uint64_t { + return static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + }; + auto setASUi = [&](bool active, const char* stage, float progress, uint32_t done, uint32_t total) { + asBuildUiActive.store(active, std::memory_order_relaxed); + asBuildUiStage.store(stage ? stage : "", std::memory_order_relaxed); + asBuildUiProgress.store(std::clamp(progress, 0.0f, 1.0f), std::memory_order_relaxed); + asBuildUiDone.store(done, std::memory_order_relaxed); + asBuildUiTotal.store(total, std::memory_order_relaxed); + // Also drive the main loading overlay progress while we're in the AS phase. 
+ if (GetLoadingPhase() == LoadingPhase::AccelerationStructures) { + SetLoadingPhaseProgress(progress); + } + }; + // Start timer if not already running + if (asBuildUiStartNs.load(std::memory_order_relaxed) == 0) { + asBuildUiStartNs.store(nowNs(), std::memory_order_relaxed); + } + setASUi(true, "AS: prepare", 0.0f, 0u, 0u); + struct ASBuildUiGuard { + Renderer* r; + explicit ASBuildUiGuard(Renderer* rr) : r(rr) { + } + ~ASBuildUiGuard() { + if (!r) + return; + r->asBuildUiActive.store(false, std::memory_order_relaxed); + r->asBuildUiStage.store("idle", std::memory_order_relaxed); + r->asBuildUiProgress.store(0.0f, std::memory_order_relaxed); + r->asBuildUiDone.store(0u, std::memory_order_relaxed); + r->asBuildUiTotal.store(0u, std::memory_order_relaxed); + r->asBuildUiStartNs.store(0u, std::memory_order_relaxed); + } + } asUiGuard(this); + + // Large scenes can take seconds to build BLAS/TLAS. Keep the watchdog alive while we work. + auto lastKick = std::chrono::steady_clock::now(); + auto kickWatchdog = [&](bool force = false) { + auto now = std::chrono::steady_clock::now(); + if (force || now - lastKick > std::chrono::milliseconds(200)) { + lastFrameUpdateTime.store(now, std::memory_order_relaxed); + lastKick = now; + } + }; + kickWatchdog(true); + + std::cout << "Building acceleration structures for " << entities.size() << " entities..." << std::endl; + + // PRECHECK: Determine how many renderable entities and unique meshes are READY right now. + // If the counts would shrink compared to the last successful build (e.g., streaming not done), + // skip rebuilding to avoid producing a TLAS that only contains a small subset (like animated fans). 
+ size_t readyRenderableCount = 0; + size_t readyUniqueMeshCount = 0; { + std::shared_lock meshLock(meshResourcesMutex); + size_t skippedInactive = 0; + size_t skippedNoMesh = 0; + size_t skippedNoRes = 0; + size_t skippedException = 0; + + std::map meshToBLASProbe; + uint32_t processed = 0; + for (Entity* entity : entities) { + if (++processed % 100 == 0) kickWatchdog(); + if (!entity || !entity->IsActive()) { + skippedInactive++; + continue; + } + auto meshComp = entity->GetComponent(); + if (!meshComp) { + skippedNoMesh++; + continue; + } + + try { + auto meshIt = meshResources.find(meshComp); + if (meshIt == meshResources.end()) { + skippedNoRes++; + continue; + } + } catch (...) { + skippedException++; + continue; + } + + readyRenderableCount++; + if (meshToBLASProbe.find(meshComp) == meshToBLASProbe.end()) { + meshToBLASProbe[meshComp] = static_cast(meshToBLASProbe.size()); + } + } + readyUniqueMeshCount = meshToBLASProbe.size(); + + // Keep this precheck quiet; any meaningful summary is printed in the main AS build block below. + (void) skippedInactive; + (void) skippedNoMesh; + (void) skippedNoRes; + (void) skippedException; + } + + if (readyRenderableCount == 0 || readyUniqueMeshCount == 0) { + std::cout << "AS build skipped: no ready meshes yet (renderables=" << readyRenderableCount + << ", uniqueMeshes=" << readyUniqueMeshCount << ")\n"; + return false; + } + + // Move old AS structures to pending deletion queue + // They will be deleted once the GPU has finished using them according to the timeline. 
+ if (!blasStructures.empty() || *tlasStructure.handle) { + PendingASDelete pendingDelete; + pendingDelete.blasStructures = std::move(blasStructures); + pendingDelete.tlasStructure = std::move(tlasStructure); + pendingDelete.timelineValue = currentTimelineValue; + pendingASDeletions.push_back(std::move(pendingDelete)); + } + + // Clear the moved-from containers (they're now empty) + blasStructures.clear(); + tlasStructure = AccelerationStructure{}; + + // Map mesh components to BLAS indices + std::map meshToBLAS; + std::vector uniqueMeshes; + + // Collect unique meshes and entities + std::vector renderableEntities; + auto containsCaseInsensitive = [](const std::string& haystack, const std::string& needle) -> bool { + std::string h = haystack; + std::string n = needle; + std::transform(h.begin(), h.end(), h.begin(), [](unsigned char c) { return std::tolower(c); }); + std::transform(n.begin(), n.end(), n.begin(), [](unsigned char c) { return std::tolower(c); }); + return h.find(n) != std::string::npos; + }; + + // Collect renderable entities for AS build without spamming logs. 
+ size_t skippedInactive = 0; + size_t skippedNoMesh = 0; + size_t skippedNoRes = 0; + size_t skippedPendingUploads = 0; + size_t skippedNullBuffers = 0; + size_t skippedZeroIndices = 0; + size_t skippedException = 0; + + uint32_t processedCount = 0; + for (Entity* entity : entities) { + if (++processedCount % 100 == 0) kickWatchdog(); + if (!entity || !entity->IsActive()) { + skippedInactive++; + continue; + } + + auto meshComp = entity->GetComponent(); + if (!meshComp) { + skippedNoMesh++; + continue; + } + + // Safely check if mesh resources exist - catch any exceptions from dereferencing potentially stale pointers + try { + auto meshIt = meshResources.find(meshComp); + if (meshIt == meshResources.end()) { + skippedNoRes++; + continue; + } + + // Validate that the mesh resources have valid buffers before adding to AS build + const auto& meshRes = meshIt->second; + // Only include when uploads finished (staging sizes are zero) + if (meshRes.vertexBufferSizeBytes != 0 || meshRes.indexBufferSizeBytes != 0) { + // Skip meshes still uploading to avoid partial TLAS builds + skippedPendingUploads++; + continue; + } + // RAII handles: check if they contain valid Vulkan handles by dereferencing + if (!*meshRes.vertexBuffer || !*meshRes.indexBuffer) { + skippedNullBuffers++; + continue; + } + + if (meshRes.indexCount == 0) { + skippedZeroIndices++; + continue; + } + } catch (const std::exception&) { + // Avoid spamming; a rebuild on the next safe frame should succeed. + skippedException++; + continue; + } + + renderableEntities.push_back(entity); + + if (meshToBLAS.find(meshComp) == meshToBLAS.end()) { + meshToBLAS[meshComp] = static_cast(uniqueMeshes.size()); + uniqueMeshes.push_back(meshComp); + } + } + + if (uniqueMeshes.empty()) { + // Nothing ready yet (e.g., mesh uploads still pending). Treat as a transient + // condition so the caller can retry next frame without clearing the request. 
+ setASUi(true, "AS: waiting on meshes", 0.0f, 0u, 0u); + return false; + } + + // One concise build summary (no per-entity spam) + std::cout << "Building AS: uniqueMeshes=" << uniqueMeshes.size() + << ", entities=" << renderableEntities.size() + << " (skipped inactive=" << skippedInactive + << ", noMesh=" << skippedNoMesh + << ", noRes=" << skippedNoRes + << ", pendingUploads=" << skippedPendingUploads + << ", nullBuffers=" << skippedNullBuffers + << ", zeroIndices=" << skippedZeroIndices + << ", exception=" << skippedException + << ")\n"; + + // Create a dedicated command pool for AS building to avoid threading issues + // The main commandPool may be in use by the render thread + vk::CommandPoolCreateInfo poolInfo{}; + poolInfo.flags = vk::CommandPoolCreateFlagBits::eTransient; + poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily.value(); + + vk::raii::CommandPool asBuildCommandPool(device, poolInfo); + + // Create command buffer for AS building + vk::CommandBufferAllocateInfo allocInfo{}; + allocInfo.commandPool = *asBuildCommandPool; + allocInfo.level = vk::CommandBufferLevel::ePrimary; + allocInfo.commandBufferCount = 1; + + vk::raii::CommandBuffers cmdBuffers(device, allocInfo); + vk::raii::CommandBuffer& cmdBuffer = cmdBuffers[0]; + + cmdBuffer.begin(vk::CommandBufferBeginInfo{ + .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit + }); + + // (Vespa-only debugging removed; keep logs quiet.) + + // Build BLAS for each unique mesh + blasStructures.resize(uniqueMeshes.size()); + + // Progress model: BLAS phase dominates. Treat TLAS + post buffers as a few extra steps. 
+ const uint32_t totalSteps = static_cast(uniqueMeshes.size()) + 3u; + setASUi(true, "AS: build BLAS", 0.0f, 0u, totalSteps); + + // Keep scratch buffers alive until GPU execution completes (after fence wait) + // Destroying them early causes "VkBuffer was destroy" validation errors and crashes + std::vector scratchBuffers; + std::vector> scratchAllocations; + + // Batch build BLAS structures in smaller chunks to avoid TDR and excessive scratch allocation + const size_t blasBatchSize = 128; + for (size_t batchStart = 0; batchStart < uniqueMeshes.size(); batchStart += blasBatchSize) { + size_t batchEnd = std::min(batchStart + blasBatchSize, uniqueMeshes.size()); + size_t currentBatchCount = batchEnd - batchStart; + + std::vector blasBuildInfos; + std::vector blasGeometries; + std::vector blasRangeInfos; + blasBuildInfos.reserve(currentBatchCount); + blasGeometries.reserve(currentBatchCount); + blasRangeInfos.reserve(currentBatchCount); + + vk::DeviceSize totalBatchScratchSize = 0; + std::vector batchScratchOffsets; + batchScratchOffsets.reserve(currentBatchCount); + + // Alignment requirement for scratch data + const vk::DeviceSize scratchAlignment = std::max( + accelStructProperties.minAccelerationStructureScratchOffsetAlignment, 128); + + for (size_t i = batchStart; i < batchEnd; ++i) { + if (i % 100 == 0) kickWatchdog(); + // Update UI progress (BLAS) + setASUi(true, + "AS: build BLAS", + totalSteps > 0 ? 
static_cast(static_cast(i)) / static_cast(totalSteps) : 0.0f, + static_cast(i), + totalSteps); + + MeshComponent* meshComp = uniqueMeshes[i]; + auto& meshRes = meshResources.at(meshComp); + + vk::DeviceAddress vertexAddress = getBufferDeviceAddress(device, *meshRes.vertexBuffer); + vk::DeviceAddress indexAddress = getBufferDeviceAddress(device, *meshRes.indexBuffer); + const uint32_t vertexCount = static_cast(meshComp->GetVertices().size()); + + vk::AccelerationStructureGeometryKHR geometry{}; + geometry.geometryType = vk::GeometryTypeKHR::eTriangles; + geometry.flags = vk::GeometryFlagBitsKHR::eOpaque; + geometry.geometry.triangles.vertexFormat = vk::Format::eR32G32B32Sfloat; + geometry.geometry.triangles.vertexData = vertexAddress; + geometry.geometry.triangles.vertexStride = sizeof(Vertex); + geometry.geometry.triangles.maxVertex = vertexCount; + geometry.geometry.triangles.indexType = vk::IndexType::eUint32; + geometry.geometry.triangles.indexData = indexAddress; + blasGeometries.push_back(geometry); + + uint32_t primitiveCount = meshRes.indexCount / 3; + + vk::AccelerationStructureBuildGeometryInfoKHR buildInfo{}; + buildInfo.type = vk::AccelerationStructureTypeKHR::eBottomLevel; + buildInfo.flags = vk::BuildAccelerationStructureFlagBitsKHR::ePreferFastTrace; + buildInfo.mode = vk::BuildAccelerationStructureModeKHR::eBuild; + buildInfo.geometryCount = 1; + buildInfo.pGeometries = &blasGeometries.back(); + + vk::AccelerationStructureBuildSizesInfoKHR sizeInfo = device.getAccelerationStructureBuildSizesKHR( + vk::AccelerationStructureBuildTypeKHR::eDevice, + buildInfo, + primitiveCount); + + vk::DeviceSize alignedScratchSize = (sizeInfo.buildScratchSize + scratchAlignment - 1) & ~(scratchAlignment - 1); + batchScratchOffsets.push_back(totalBatchScratchSize); + totalBatchScratchSize += alignedScratchSize; + + auto [blasBuffer, blasAlloc] = createBufferPooled( + sizeInfo.accelerationStructureSize, + vk::BufferUsageFlagBits::eAccelerationStructureStorageKHR | 
vk::BufferUsageFlagBits::eShaderDeviceAddress, + vk::MemoryPropertyFlagBits::eDeviceLocal); + + vk::AccelerationStructureCreateInfoKHR createInfo{}; + createInfo.buffer = *blasBuffer; + createInfo.size = sizeInfo.accelerationStructureSize; + createInfo.type = vk::AccelerationStructureTypeKHR::eBottomLevel; + + vk::raii::AccelerationStructureKHR blasHandle(device, createInfo); + + buildInfo.dstAccelerationStructure = *blasHandle; + blasBuildInfos.push_back(buildInfo); + + vk::AccelerationStructureBuildRangeInfoKHR rangeInfo{}; + rangeInfo.primitiveCount = primitiveCount; + rangeInfo.primitiveOffset = 0; + rangeInfo.firstVertex = 0; + rangeInfo.transformOffset = 0; + blasRangeInfos.push_back(rangeInfo); + + vk::AccelerationStructureDeviceAddressInfoKHR addressInfo{}; + addressInfo.accelerationStructure = *blasHandle; + vk::DeviceAddress blasAddress = device.getAccelerationStructureAddressKHR(addressInfo); + + blasStructures[i].buffer = std::move(blasBuffer); + blasStructures[i].allocation = std::move(blasAlloc); + blasStructures[i].handle = std::move(blasHandle); + blasStructures[i].deviceAddress = blasAddress; + } + + // Create a single shared scratch buffer for this batch + if (totalBatchScratchSize > 0) { + auto [scratchBuffer, scratchAlloc] = createBufferPooled( + totalBatchScratchSize, + vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress, + vk::MemoryPropertyFlagBits::eDeviceLocal); + vk::DeviceAddress scratchBaseAddress = getBufferDeviceAddress(device, *scratchBuffer); + + for (size_t j = 0; j < blasBuildInfos.size(); ++j) { + blasBuildInfos[j].scratchData = scratchBaseAddress + batchScratchOffsets[j]; + } + + std::vector pRangeInfos; + pRangeInfos.reserve(blasRangeInfos.size()); + for (const auto& r : blasRangeInfos) { + pRangeInfos.push_back(&r); + } + cmdBuffer.buildAccelerationStructuresKHR(blasBuildInfos, pRangeInfos); + + // Scratch buffer must stay alive until GPU is done with this batch. 
+ // For simplicity in this one-time build, we'll keep all batch scratch buffers alive + // until the final fence wait at the end of the function. + scratchBuffers.push_back(std::move(scratchBuffer)); + scratchAllocations.push_back(std::move(scratchAlloc)); + } + } + // BLAS done + setASUi(true, + "AS: build TLAS", + totalSteps > 0 ? static_cast(static_cast(uniqueMeshes.size())) / static_cast(totalSteps) : 0.0f, + static_cast(uniqueMeshes.size()), + totalSteps); + + // Barrier between BLAS and TLAS builds + + // Barrier between BLAS and TLAS builds + vk::MemoryBarrier2 barrier{}; + barrier.srcStageMask = vk::PipelineStageFlagBits2::eAccelerationStructureBuildKHR; + barrier.srcAccessMask = vk::AccessFlagBits2::eAccelerationStructureWriteKHR; + barrier.dstStageMask = vk::PipelineStageFlagBits2::eAccelerationStructureBuildKHR; + barrier.dstAccessMask = vk::AccessFlagBits2::eAccelerationStructureReadKHR; + + vk::DependencyInfo depInfo{}; + depInfo.memoryBarrierCount = 1; + depInfo.pMemoryBarriers = &barrier; + cmdBuffer.pipelineBarrier2(depInfo); + + // Build TLAS with instances + // NOTE: many entities are instanced; reserve based on an estimated total instance count. + size_t estimatedInstances = 0; + for (Entity* e : renderableEntities) { + if (!e) continue; + if (auto* mc = e->GetComponent()) { + const size_t c = mc->GetInstanceCount(); + estimatedInstances += (c > 0) ? c : 1; + } else { + estimatedInstances += 1; + } + if (estimatedInstances > 1000000) { + break; // safety + } + } + std::vector instances; + instances.reserve(std::max(renderableEntities.size(), estimatedInstances)); + + // Build per-instance geometry info in the SAME order as TLAS instances + std::vector geometryInfos; // defined later in file; we reuse the type + geometryInfos.reserve(instances.capacity()); + tlasInstanceOrder.clear(); + + // Ray Query texture table (binding 6): seed reserved shared-default slots. 
+ // We will assign per-material texture indices into this table, and the descriptor update + // will resolve each slot to either the streamed texture or a type-appropriate fallback. + rayQueryTexKeys.clear(); + rayQueryTexFallbackSlots.clear(); + rayQueryTexIndex.clear(); + rayQueryTexCount = 0; + + auto seedReservedSlot = [&](uint32_t slot, const std::string& id) { + if (rayQueryTexKeys.size() <= slot) { + rayQueryTexKeys.resize(slot + 1); + rayQueryTexFallbackSlots.resize(slot + 1); + } + const std::string key = ResolveTextureId(id); + rayQueryTexKeys[slot] = key; + rayQueryTexFallbackSlots[slot] = slot; + rayQueryTexIndex[key] = slot; + }; + + seedReservedSlot(RQ_SLOT_DEFAULT_BASECOLOR, SHARED_DEFAULT_ALBEDO_ID); + seedReservedSlot(RQ_SLOT_DEFAULT_NORMAL, SHARED_DEFAULT_NORMAL_ID); + seedReservedSlot(RQ_SLOT_DEFAULT_METALROUGH, SHARED_DEFAULT_METALLIC_ROUGHNESS_ID); + seedReservedSlot(RQ_SLOT_DEFAULT_OCCLUSION, SHARED_DEFAULT_OCCLUSION_ID); + seedReservedSlot(RQ_SLOT_DEFAULT_EMISSIVE, SHARED_DEFAULT_EMISSIVE_ID); + rayQueryTexCount = static_cast(rayQueryTexKeys.size()); + + // Build an authoritative lookup from `materialIndex` -> `Material*`. + // We already embed both the numeric `materialIndex` and the material name in entity names + // (`modelName_Material__`). Use this mapping so TLAS instance flags + // can be set per-instance using the resolved `materialIndex` (critical for MASK/BLEND decals). 
+ std::unordered_map materialByIndex; + + // Per-entity pre-calculated properties to avoid redundant string parsing in hot loops + struct EntityASProperties { + uint32_t materialIndex = 0; + bool isEnvironment = false; + bool forceNoOpaque = false; + bool forceOpaque = false; + const Material* material = nullptr; + }; + std::unordered_map entityASProps; + entityASProps.reserve(renderableEntities.size()); + + if (modelLoader) { + materialByIndex.reserve(renderableEntities.size()); + static constexpr uint32_t kMaxSupportedMaterialIndex = 100000u; + + watchdogProgressLabel.store("AS: pre-calculate entity properties", std::memory_order_relaxed); + uint32_t processedCount = 0; + + for (Entity* e : renderableEntities) { + if (!e) continue; + + if (++processedCount % 500 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + watchdogProgressIndex.store(processedCount, std::memory_order_relaxed); + } + + EntityASProperties& props = entityASProps[e]; + const std::string& name = e->GetName(); + + // 1. Determine material index and Material* + if (name.find("Ball_") == 0) { + props.materialIndex = 9999u; + props.material = modelLoader->GetMaterial("BallMaterial"); + } else { + size_t matPos = name.find("_Material_"); + if (matPos != std::string::npos) { + size_t numStart = matPos + 10; // length of "_Material_" + size_t numEnd = name.find('_', numStart); + if (numEnd != std::string::npos) { + try { + props.materialIndex = static_cast(std::stoi(name.substr(numStart, numEnd - numStart))); + if (numEnd + 1 < name.size()) { + props.material = modelLoader->GetMaterial(name.substr(numEnd + 1)); + } + } catch (...) {} + } + } + } + + if (props.material) { + materialByIndex[props.materialIndex] = props.material; + } + + // 2. 
Determine environment status + std::string nameLower = name; + std::transform(nameLower.begin(), nameLower.end(), nameLower.begin(), [](unsigned char c){ return std::tolower(c); }); + if (nameLower.find("sky") != std::string::npos || + nameLower.find("dome") != std::string::npos || + nameLower.find("env") != std::string::npos || + nameLower.find("bg") != std::string::npos || + nameLower.find("atmosphere") != std::string::npos || + nameLower.find("cloud") != std::string::npos || + nameLower.find("fog") != std::string::npos || + nameLower.find("background") != std::string::npos || + nameLower.find("exterior") != std::string::npos) { + props.isEnvironment = true; + } + + // 3. Safety check scale + if (!props.isEnvironment) { + auto transform = e->GetComponent(); + glm::vec3 scale = transform ? transform->GetScale() : glm::vec3(1.0f); + if (scale.x > 400.0f || scale.y > 400.0f || scale.z > 400.0f) { + props.isEnvironment = true; + } + } + + // 4. Determine opacity from cache/material + bool forceNoOpaqueCache = false; + bool forceOpaqueCache = false; + const Material* cachedMat = nullptr; + + auto itRes = entityResources.find(e); + if (itRes != entityResources.end()) { + ensureEntityMaterialCache(e, itRes->second); + const MaterialProperties& mp = itRes->second.cachedMaterialProps; + forceNoOpaqueCache = (mp.alphaMask > 0.5f) || itRes->second.cachedIsBlended || (mp.transmissionFactor > 0.01f) || itRes->second.cachedIsGlass || itRes->second.cachedIsLiquid; + forceOpaqueCache = !forceNoOpaqueCache; + cachedMat = itRes->second.cachedMaterial; + } + + const Material* mat = props.material ? 
props.material : cachedMat; + bool forceNoOpaqueMat = false; + bool forceOpaqueMat = false; + if (mat) { + forceNoOpaqueMat = (mat->alphaMode == "MASK") || (mat->alphaMode == "BLEND") || mat->isGlass || (mat->transmissionFactor > 0.01f); + forceOpaqueMat = (mat->alphaMode == "OPAQUE") && (!mat->isGlass) && (mat->transmissionFactor <= 0.01f); + } + + props.forceNoOpaque = forceNoOpaqueCache || forceNoOpaqueMat; + props.forceOpaque = (!props.forceNoOpaque) && (!props.isEnvironment) && (forceOpaqueCache || forceOpaqueMat); + } + } + + auto addTextureSlot = [&](const std::string& texId, uint32_t fallbackSlot) -> uint32_t { + if (texId.empty()) + return fallbackSlot; + std::string key = ResolveTextureId(texId); + auto it = rayQueryTexIndex.find(key); + if (it != rayQueryTexIndex.end()) + return it->second; + if (rayQueryTexCount >= RQ_MAX_TEX) + return fallbackSlot; + + uint32_t slot = rayQueryTexCount; + rayQueryTexKeys.push_back(key); + rayQueryTexFallbackSlots.push_back(fallbackSlot); + rayQueryTexIndex[key] = slot; + rayQueryTexCount++; + + // Ensure streaming is requested (CPU-side decode can happen off-thread; GPU upload stays on main thread). + try { + RegisterTextureUser(key, nullptr); + } catch (...) { + } + return slot; + }; + + uint32_t runningInstanceIndex = 0; + watchdogProgressLabel.store("AS: TLAS build (instances)", std::memory_order_relaxed); + + for (auto entity : renderableEntities) { + if (!entity) continue; + + const EntityASProperties& props = entityASProps[entity]; + auto meshComp = entity->GetComponent(); + uint32_t blasIndex = meshToBLAS.at(meshComp); + + auto transform = entity->GetComponent(); + const glm::mat4 entityModel = transform ? 
transform->GetModelMatrix() : glm::mat4(1.0f); + + const size_t meshInstCount = meshComp->GetInstanceCount(); + const bool hasInstance = (meshInstCount > 0); + const size_t instCount = std::max(1, meshInstCount); + + for (size_t iInst = 0; iInst < instCount; ++iInst) { + // Kick watchdog less frequently during instance loop + if (runningInstanceIndex % 1000 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + watchdogProgressIndex.store(runningInstanceIndex, std::memory_order_relaxed); + } + + glm::mat4 finalModel = entityModel; + uint32_t resolvedMaterialIndex = props.materialIndex; + + if (hasInstance && iInst < meshInstCount) { + const InstanceData& id = meshComp->GetInstance(iInst); + finalModel = entityModel * id.getModelMatrix(); + resolvedMaterialIndex = id.materialIndex; + } + + const float* m = glm::value_ptr(finalModel); + vk::TransformMatrixKHR vkTransform; + for (int row = 0; row < 3; row++) { + for (int col = 0; col < 4; col++) { + vkTransform.matrix[row][col] = m[col * 4 + row]; + } + } + + vk::AccelerationStructureInstanceKHR AS_Instance{}; + AS_Instance.transform = vkTransform; + AS_Instance.instanceCustomIndex = runningInstanceIndex; + AS_Instance.instanceShaderBindingTableRecordOffset = runningInstanceIndex; + + VkGeometryInstanceFlagsKHR instFlags = VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR; + if (props.forceNoOpaque) { + instFlags |= VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR; + } else if (props.forceOpaque) { + instFlags |= VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR; + } + AS_Instance.flags = static_cast(instFlags); + AS_Instance.mask = props.isEnvironment ? 0x02 : 0x01; + AS_Instance.accelerationStructureReference = blasStructures[blasIndex].deviceAddress; + instances.push_back(AS_Instance); + + TlasInstanceRef ref{}; + ref.entity = entity; + ref.instanced = hasInstance; + ref.instanceIndex = static_cast(hasInstance ? 
iInst : 0); + tlasInstanceOrder.push_back(ref); + + const auto& meshRes = meshResources.at(meshComp); + GeometryInfo gi{}; + gi.vertexBufferAddress = getBufferDeviceAddress(device, *meshRes.vertexBuffer); + gi.indexBufferAddress = getBufferDeviceAddress(device, *meshRes.indexBuffer); + // Cache vertexCount from meshComp outside this hot inner loop if possible, + // but for now ensure it's at least not a redundant function call if the compiler doesn't optimize it. + const uint32_t vCount = static_cast(meshComp->GetVertices().size()); + gi.vertexCount = vCount; + gi.materialIndex = resolvedMaterialIndex; + gi.indexCount = meshRes.indexCount; + gi._pad0 = 0; + + glm::mat3 nrm = glm::transpose(glm::inverse(glm::mat3(finalModel))); + gi.normalMatrix0 = glm::vec4(nrm[0], 0.0f); + gi.normalMatrix1 = glm::vec4(nrm[1], 0.0f); + gi.normalMatrix2 = glm::vec4(nrm[2], 0.0f); + geometryInfos.push_back(gi); + + runningInstanceIndex++; + } + } + + // Build TLAS + + // Create instances buffer (persistent for TLAS UPDATE/Refit) + vk::DeviceSize instancesSize = sizeof(vk::AccelerationStructureInstanceKHR) * instances.size(); + auto [instancesBufferTmp, instancesAllocTmp] = createBufferPooled( + instancesSize, + vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR | vk::BufferUsageFlagBits::eShaderDeviceAddress, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Upload instances - use mappedPtr directly + void* instancesData = instancesAllocTmp->mappedPtr; + if (!instancesData) { + std::cerr << "Failed to get mapped pointer for instances buffer\n"; + return false; + } + memcpy(instancesData, instances.data(), instancesSize); + + // Persist instances buffer/allocation and order for UPDATE (refit) + tlasInstancesBuffer = std::move(instancesBufferTmp); + tlasInstancesAllocation = std::move(instancesAllocTmp); + tlasInstanceCount = static_cast(instances.size()); + // tlasInstanceOrder already filled above in the same order as 
'instances' + + // (Debug TLAS composition logs removed.) + + vk::DeviceAddress instancesAddress = getBufferDeviceAddress(device, *tlasInstancesBuffer); + + // TLAS geometry + vk::AccelerationStructureGeometryKHR tlasGeometry{}; + tlasGeometry.geometryType = vk::GeometryTypeKHR::eInstances; + // Do not force OPAQUE here; leave flags at 0 so ray queries may process + // transparency/glass more flexibly (any-hit not used in our path). + tlasGeometry.flags = {}; + tlasGeometry.geometry.instances = vk::AccelerationStructureGeometryInstancesDataKHR{ + .arrayOfPointers = VK_FALSE, + .data = instancesAddress + }; + + // TLAS build info + vk::AccelerationStructureBuildGeometryInfoKHR tlasBuildInfo{}; + tlasBuildInfo.type = vk::AccelerationStructureTypeKHR::eTopLevel; + tlasBuildInfo.flags = vk::BuildAccelerationStructureFlagBitsKHR::ePreferFastTrace | + vk::BuildAccelerationStructureFlagBitsKHR::eAllowUpdate; // enable UPDATE/Refit + tlasBuildInfo.mode = vk::BuildAccelerationStructureModeKHR::eBuild; + tlasBuildInfo.geometryCount = 1; + tlasBuildInfo.pGeometries = &tlasGeometry; + + auto instanceCount = static_cast(instances.size()); + + // Get TLAS size requirements + vk::AccelerationStructureBuildSizesInfoKHR tlasSizeInfo = device.getAccelerationStructureBuildSizesKHR( + vk::AccelerationStructureBuildTypeKHR::eDevice, + tlasBuildInfo, + instanceCount); + + // Create TLAS buffer + auto [tlasBuffer, tlasAlloc] = createBufferPooled( + tlasSizeInfo.accelerationStructureSize, + vk::BufferUsageFlagBits::eAccelerationStructureStorageKHR | vk::BufferUsageFlagBits::eShaderDeviceAddress, + vk::MemoryPropertyFlagBits::eDeviceLocal); + + // Create TLAS + vk::AccelerationStructureCreateInfoKHR tlasCreateInfo{}; + tlasCreateInfo.buffer = *tlasBuffer; + tlasCreateInfo.size = tlasSizeInfo.accelerationStructureSize; + tlasCreateInfo.type = vk::AccelerationStructureTypeKHR::eTopLevel; + + vk::raii::AccelerationStructureKHR tlasHandle(device, tlasCreateInfo); + + // Create TLAS scratch 
buffer (for initial build) + auto [tlasScratchBuffer, tlasScratchAlloc] = createBufferPooled( + tlasSizeInfo.buildScratchSize, + vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress, + vk::MemoryPropertyFlagBits::eDeviceLocal); + + vk::DeviceAddress tlasScratchAddress = getBufferDeviceAddress(device, *tlasScratchBuffer); + + // Update TLAS build info (dereference RAII handle) + tlasBuildInfo.dstAccelerationStructure = *tlasHandle; + tlasBuildInfo.scratchData = tlasScratchAddress; + + // Keep TLAS scratch buffer alive until after GPU execution (after fence wait) + scratchBuffers.push_back(std::move(tlasScratchBuffer)); + scratchAllocations.push_back(std::move(tlasScratchAlloc)); + + // Ensure/update a persistent scratch buffer for TLAS UPDATE (refit) + // Allocate once sized to updateScratchSize; recreate if needed for larger scenes + if (!*tlasUpdateScratchBuffer || !tlasUpdateScratchAllocation || tlasUpdateScratchAllocation->size < tlasSizeInfo.updateScratchSize) { + auto [updBuf, updAlloc] = createBufferPooled( + tlasSizeInfo.updateScratchSize, + vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress, + vk::MemoryPropertyFlagBits::eDeviceLocal); + tlasUpdateScratchBuffer = std::move(updBuf); + tlasUpdateScratchAllocation = std::move(updAlloc); + } + + // TLAS build range + vk::AccelerationStructureBuildRangeInfoKHR tlasRangeInfo{}; + tlasRangeInfo.primitiveCount = instanceCount; + tlasRangeInfo.primitiveOffset = 0; + tlasRangeInfo.firstVertex = 0; + tlasRangeInfo.transformOffset = 0; + + // Build TLAS - Vulkan-Hpp RAII takes array spans, not pointers + std::array tlasRangeInfos = {&tlasRangeInfo}; + cmdBuffer.buildAccelerationStructuresKHR(tlasBuildInfo, tlasRangeInfos); + + // Get TLAS device address (dereference RAII handle) + vk::AccelerationStructureDeviceAddressInfoKHR tlasAddressInfo{}; + tlasAddressInfo.accelerationStructure = *tlasHandle; + vk::DeviceAddress tlasAddress = 
device.getAccelerationStructureAddressKHR(tlasAddressInfo); + + // Store TLAS (move RAII handle to avoid copy) + tlasStructure.buffer = std::move(tlasBuffer); + tlasStructure.allocation = std::move(tlasAlloc); + tlasStructure.handle = std::move(tlasHandle); + tlasStructure.deviceAddress = tlasAddress; + + cmdBuffer.end(); + + // Submit and wait + vk::SubmitInfo submitInfo{}; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &(*cmdBuffer); + + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + SubmitToQueue2(*graphicsQueue, *cmdBuffer, false, nullptr, *fence); + + // Wait with periodic watchdog kicks to avoid false hang detection on large scenes. + // Use a longer timeout (30s) for large scenes in Debug + (void) waitForFencesSafe(*fence, VK_TRUE, 30'000'000'000ULL); + // TLAS build completed on GPU + setASUi(true, + "AS: upload buffers", + totalSteps > 0 ? static_cast(static_cast(uniqueMeshes.size()) + 1u) / static_cast(totalSteps) : 0.0f, + static_cast(uniqueMeshes.size()) + 1u, + totalSteps); + + // (Verbose TLAS composition dumps removed; keep logs quiet.) + + // Record the counts we just built so we don't rebuild with smaller subsets later. + // Keep entity counts and TLAS instance counts separate to avoid unit mismatches. 
+ lastASBuiltBLASCount = blasStructures.size(); + lastASBuiltInstanceCount = renderableEntities.size(); + lastBuiltUniqueMeshCount = uniqueMeshes.size(); + lastASBuiltTlasInstanceCount = instanceCount; + lastASBuildTime = std::chrono::steady_clock::now(); + + // Build geometry info buffer PER INSTANCE (same order as TLAS instances) + // geometryInfos already populated above in TLAS instance loop + + // Create and upload geometry info buffer + if (!geometryInfos.empty()) { + vk::DeviceSize geoInfoSize = sizeof(GeometryInfo) * geometryInfos.size(); + auto [geoInfoBuf, geoInfoAlloc] = createBufferPooled( + geoInfoSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + void* geoInfoData = geoInfoAlloc->mappedPtr; + if (geoInfoData) { + memcpy(geoInfoData, geometryInfos.data(), geoInfoSize); + } + + geometryInfoBuffer = std::move(geoInfoBuf); + geometryInfoAllocation = std::move(geoInfoAlloc); + geometryInfoCountCPU = geometryInfos.size(); + + // (Verbose geometry info buffer stats removed.) + } + // Post buffers done + setASUi(true, + "AS: finalize", + totalSteps > 0 ? static_cast(static_cast(uniqueMeshes.size()) + 2u) / static_cast(totalSteps) : 1.0f, + static_cast(uniqueMeshes.size()) + 2u, + totalSteps); + + // Build material buffer with real materials from ModelLoader + { + // Build material buffer + + // Collect material indices used by this TLAS build. + // Do not rely only on entity-name parsing here: instanced geometry uses + // `InstanceData.materialIndex`, which may not appear in entity names (decals/foliage). 
+ std::unordered_set usedMaterialIndices; + usedMaterialIndices.reserve(geometryInfos.size() + 16); + usedMaterialIndices.insert(0u); + + std::map materialIndexToName; // legacy fallback (debug/heuristics) + static constexpr uint32_t kMaxSupportedMaterialIndex = 100000u; + + size_t entityCount = 0; + for (Entity* entity : renderableEntities) { + std::string entityName = entity->GetName(); + + // Parse entity name: "modelName_Material_{materialIndex}_materialName" + size_t matPos = entityName.find("_Material_"); + if (matPos != std::string::npos) { + size_t numStart = matPos + 10; // length of "_Material_" + size_t numEnd = entityName.find('_', numStart); + + if (numEnd != std::string::npos) { + try { + uint32_t matIndex = std::stoi(entityName.substr(numStart, numEnd - numStart)); + if (matIndex > kMaxSupportedMaterialIndex) { + // Malformed entity name (or unexpected content) could yield a huge index. + // Skip to avoid allocating an enormous material table or writing out of bounds. + continue; + } + + // Extract material name (everything after materialIndex_) + std::string materialName = entityName.substr(numEnd + 1); + materialIndexToName[matIndex] = materialName; + usedMaterialIndices.insert(matIndex); + } catch (...) { + // Failed to parse, skip + } + } + } else if (entityName.find("Ball_") == 0) { + materialIndexToName[9999] = "BallMaterial"; + usedMaterialIndices.insert(9999u); + } + + entityCount++; + // Progress indicator removed (log-noise) + } + + // Authoritative: include indices referenced by built geometry infos (covers instanced materials). + for (const GeometryInfo& gi : geometryInfos) { + if (gi.materialIndex <= kMaxSupportedMaterialIndex) { + usedMaterialIndices.insert(gi.materialIndex); + } + } + + // (Verbose material discovery logs removed.) 
+ + // Create default material for index 0 and any missing indices + MaterialData defaultMat{}; + defaultMat.albedo = glm::vec3(0.8f, 0.8f, 0.8f); + defaultMat.metallic = 0.0f; + defaultMat.roughness = 0.5f; + defaultMat.emissive = glm::vec3(0.0f); + defaultMat.ao = 1.0f; + defaultMat.ior = 1.5f; + defaultMat.emissiveStrength = 1.0f; + defaultMat.alpha = 1.0f; + defaultMat.transmissionFactor = 0.0f; + defaultMat.alphaCutoff = 0.5f; + defaultMat.alphaMode = 0; // OPAQUE + defaultMat.isGlass = 0; + defaultMat.isLiquid = 0; + // Thick-glass defaults + defaultMat.absorptionColor = glm::vec3(1.0f); + defaultMat.absorptionDistance = 1.0f; + defaultMat.thinWalled = 1u; // default to thin to avoid over-darkening + // Texture-set flags: -1 = no texture bound for that channel + defaultMat.baseColorTextureSet = -1; + defaultMat.physicalDescriptorTextureSet = -1; + defaultMat.normalTextureSet = -1; + defaultMat.occlusionTextureSet = -1; + defaultMat.emissiveTextureSet = -1; + // Default texture indices (reserved slots) + defaultMat.baseColorTexIndex = static_cast(RQ_SLOT_DEFAULT_BASECOLOR); + defaultMat.normalTexIndex = static_cast(RQ_SLOT_DEFAULT_NORMAL); + defaultMat.physicalTexIndex = static_cast(RQ_SLOT_DEFAULT_METALROUGH); + defaultMat.occlusionTexIndex = static_cast(RQ_SLOT_DEFAULT_OCCLUSION); + defaultMat.emissiveTexIndex = static_cast(RQ_SLOT_DEFAULT_EMISSIVE); + defaultMat.useSpecGlossWorkflow = 0; + defaultMat.glossinessFactor = 1.0f; + defaultMat.specularFactor = glm::vec3(0.04f); + defaultMat.hasEmissiveStrengthExt = 0; + defaultMat._padMat[0] = defaultMat._padMat[1] = defaultMat._padMat[2] = 0; + + // Build material array with proper indexing + std::vector materials; + + // Determine max material index to size the array + uint32_t maxMaterialIndex = 0; + for (uint32_t index : usedMaterialIndices) { + maxMaterialIndex = std::max(maxMaterialIndex, index); + } + maxMaterialIndex = std::min(maxMaterialIndex, kMaxSupportedMaterialIndex); + + // Ensure minimum size of 
100 materials for safety (matches original implementation) + uint32_t materialCount = std::max(maxMaterialIndex + 1, 100u); + materials.resize(materialCount, defaultMat); + + // Capture per-material texture paths (for streaming requests and debugging) + rqMaterialTexPaths.clear(); + rqMaterialTexPaths.resize(materials.size()); + + // Populate materials from ModelLoader + uint32_t loadedCount = 0; + uint32_t glassCount = 0; + uint32_t transparentCount = 0; + if (modelLoader) { + // Populate materials from ModelLoader + size_t matProcessed = 0; + for (uint32_t index : usedMaterialIndices) { + if (index >= materials.size()) + continue; + + // `materialIndex` in this engine is not guaranteed to match the glTF + // material array index (especially for instanced meshes). Do not resolve by numeric + // index here; prefer the name mapping when available and otherwise fall back to a + // safe default material. + const Material* sourceMat = nullptr; + std::string materialName; + auto itName = materialIndexToName.find(index); + if (itName != materialIndexToName.end()) { + materialName = itName->second; + sourceMat = modelLoader->GetMaterial(materialName); + } + + if (!sourceMat && index == 9999u) { + // Create a virtual red material for spawned balls + MaterialData& matData = materials[index]; + matData.albedo = glm::vec3(1.0f, 0.05f, 0.05f); // Bright red + matData.roughness = 0.4f; + matData.metallic = 0.0f; + matData.ao = 1.0f; + matData.emissive = glm::vec3(0.0f); + matData.alpha = 1.0f; + matData.alphaMode = 0; // OPAQUE + matData.isGlass = 0; + matData.transmissionFactor = 0.0f; + matData.baseColorTextureSet = -1; + matData.normalTextureSet = -1; + matData.physicalDescriptorTextureSet = -1; + matData.occlusionTextureSet = -1; + matData.emissiveTextureSet = -1; + continue; + } + + if (sourceMat) { + MaterialData& matData = materials[index]; + + // Copy PBR properties from Material to MaterialData + matData.albedo = sourceMat->albedo; + matData.metallic = 
sourceMat->metallic; + matData.emissive = sourceMat->emissive; + matData.roughness = sourceMat->roughness; + matData.ao = sourceMat->ao; + matData.ior = sourceMat->ior; + matData.emissiveStrength = sourceMat->emissiveStrength; + matData.alpha = sourceMat->alpha; + matData.transmissionFactor = sourceMat->transmissionFactor; + matData.alphaCutoff = sourceMat->alphaCutoff; + + // Thick-glass parameters (no glTF volume parsing yet; use sensible defaults) + matData.absorptionColor = glm::vec3(1.0f); + matData.absorptionDistance = 1.0f; + // Consider engine-tagged glass as thick by default; others thin + matData.thinWalled = (sourceMat->isGlass ? 0u : 1u); + // Alpha mode encoding must match `shaders/ray_query.slang`: + // 0=OPAQUE, 1=MASK, 2=BLEND + if (sourceMat->alphaMode == "MASK") { + matData.alphaMode = 1; + } else if (sourceMat->alphaMode == "BLEND") { + matData.alphaMode = 2; + } else { + matData.alphaMode = 0; + } + // Heuristics to improve glass tagging for Ray Query path + // Many Bistro assets do not carry transmission extensions; use name hints + { + std::string lower = materialName; + std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + + // Decal materials in Bistro are primarily identified by their baseColor texture path. + // `Material::albedoTexturePath` is often an alias ID like `gltf_texture_#`. + // Resolve it to the canonical path so we can detect `textures/decals/...` reliably. 
+ if (matData.alphaMode == 0 && !sourceMat->albedoTexturePath.empty()) { + std::string resolvedBase = ResolveTextureId(sourceMat->albedoTexturePath); + std::string baseLower = resolvedBase; + std::transform(baseLower.begin(), baseLower.end(), baseLower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (baseLower.find("/decals/") != std::string::npos || + baseLower.find("\\decals\\") != std::string::npos || + baseLower.find("_decal") != std::string::npos) { + matData.alphaMode = 2; + } + } + + const bool hasGlassWord = lower.find("glass") != std::string::npos; + const bool isWindowPane = (lower.find("window") != std::string::npos) || (lower.find("pane") != std::string::npos); + const bool isLampGlass = (lower.find("lamp") != std::string::npos) && hasGlassWord; + const bool isGlassware = (lower.find("goblet") != std::string::npos) || (lower.find("bottle") != std::string::npos) || (lower.find("wine") != std::string::npos); + + bool markGlass = sourceMat->isGlass || hasGlassWord || isWindowPane || isLampGlass || isGlassware; + matData.isGlass = markGlass ? 1u : 0u; + matData.isLiquid = sourceMat->isLiquid ? 1u : 0u; + + // Ensure non-zero transmission for glass-like materials lacking the extension + if (markGlass && matData.transmissionFactor < 0.85f) { + matData.transmissionFactor = 0.9f; + } + + // Thin/thick hint refinement: panes/lamps are thin shells; glassware is thick + if (isWindowPane || isLampGlass) { + matData.thinWalled = 1u; + } else if (isGlassware) { + matData.thinWalled = 0u; + } + } + + // Texture-set flags (raster parity): -1 means no texture is authored for that slot. + matData.baseColorTextureSet = sourceMat->albedoTexturePath.empty() ? -1 : 0; + if (sourceMat->useSpecularGlossiness) { + matData.physicalDescriptorTextureSet = sourceMat->specGlossTexturePath.empty() ? -1 : 0; + } else { + matData.physicalDescriptorTextureSet = sourceMat->metallicRoughnessTexturePath.empty() ? 
-1 : 0; + } + matData.normalTextureSet = sourceMat->normalTexturePath.empty() ? -1 : 0; + matData.occlusionTextureSet = sourceMat->occlusionTexturePath.empty() ? -1 : 0; + matData.emissiveTextureSet = sourceMat->emissiveTexturePath.empty() ? -1 : 0; + + // Texture paths and stable indices into the Ray Query texture table (binding 6) + if (index < rqMaterialTexPaths.size()) { + RQMaterialTexPaths& paths = rqMaterialTexPaths[index]; + // Resolve alias IDs (`gltf_texture_*`) to canonical keys (file paths) so RayQuery + // samples the correct textures and decal heuristics can match paths. + paths.baseColor = ResolveTextureId(sourceMat->albedoTexturePath); + paths.normal = ResolveTextureId(sourceMat->normalTexturePath); + paths.physical = ResolveTextureId(sourceMat->useSpecularGlossiness ? sourceMat->specGlossTexturePath : sourceMat->metallicRoughnessTexturePath); + paths.occlusion = ResolveTextureId(sourceMat->occlusionTexturePath); + paths.emissive = ResolveTextureId(sourceMat->emissiveTexturePath); + + matData.baseColorTexIndex = static_cast(addTextureSlot(paths.baseColor, RQ_SLOT_DEFAULT_BASECOLOR)); + matData.normalTexIndex = static_cast(addTextureSlot(paths.normal, RQ_SLOT_DEFAULT_NORMAL)); + matData.physicalTexIndex = static_cast(addTextureSlot(paths.physical, RQ_SLOT_DEFAULT_METALROUGH)); + matData.occlusionTexIndex = static_cast(addTextureSlot(paths.occlusion, RQ_SLOT_DEFAULT_OCCLUSION)); + matData.emissiveTexIndex = static_cast(addTextureSlot(paths.emissive, RQ_SLOT_DEFAULT_EMISSIVE)); + + } + + // Specular-glossiness workflow support + matData.useSpecGlossWorkflow = sourceMat->useSpecularGlossiness ? 1 : 0; + matData.glossinessFactor = sourceMat->glossinessFactor; + matData.specularFactor = sourceMat->specularFactor; + matData.hasEmissiveStrengthExt = (std::abs(sourceMat->emissiveStrength - 1.0f) > 1e-6f) ? 
1 : 0; + matData._padMat[0] = matData._padMat[1] = matData._padMat[2] = 0; + + // Track glass and transparent materials for statistics + if (sourceMat->isGlass) { + glassCount++; + } + if (sourceMat->transmissionFactor > 0.1f) { + transparentCount++; + } + + loadedCount++; + } else { + std::cerr << "Warning: Material '" << materialName + << "' not found in ModelLoader for index " << index << "\n"; + } + + matProcessed++; + } + } else { + std::cerr << "Warning: ModelLoader not available, using default materials\n"; + } + + // Create and upload material buffer (always create, even if no materials found) + vk::DeviceSize matSize = sizeof(MaterialData) * materials.size(); + auto [matBuf, matAlloc] = createBufferPooled( + matSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + void* matData = matAlloc->mappedPtr; + if (matData) { + memcpy(matData, materials.data(), matSize); + } + + materialBuffer = std::move(matBuf); + materialAllocation = std::move(matAlloc); + + // (Verbose material buffer stats removed.) + + // Record material count for shader-side bounds (provided via UBO) + materialCountCPU = materials.size(); + } + + // The TLAS/material/geometry buffers and texture table contents may have changed. + // Mark ray query descriptor sets dirty so the render thread refreshes them at the next safe point. + const uint32_t allFramesMask = (MAX_FRAMES_IN_FLIGHT >= 32u) ? 
0xFFFFFFFFu : ((1u << MAX_FRAMES_IN_FLIGHT) - 1u); + rayQueryDescriptorsDirtyMask.fetch_or(allFramesMask, std::memory_order_relaxed); + + setASUi(true, "AS: done", 1.0f, totalSteps, totalSteps); + const auto elapsedMs = std::chrono::duration_cast(std::chrono::steady_clock::now() - asStartCpu).count(); + std::cout << "AS build completed in " << (static_cast(elapsedMs) / 1000.0) + << "s (uniqueMeshes=" << uniqueMeshes.size() + << ", entities=" << renderableEntities.size() + << ", tlasInstances=" << instanceCount << ")\n"; + auto asEndTime = std::chrono::steady_clock::now(); + auto asDurationMs = std::chrono::duration_cast(asEndTime - asStartTime).count(); + return true; + } catch (const std::exception& e) { + const uint64_t startNs = asBuildUiStartNs.load(std::memory_order_relaxed); + if (startNs != 0) { + const uint64_t nowNs = static_cast( + std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count()); + const double secs = (nowNs > startNs) ? (static_cast(nowNs - startNs) / 1'000'000'000.0) : 0.0; + std::cerr << "Failed to build acceleration structures after " << secs << "s: " << e.what() << std::endl; + } else { + std::cerr << "Failed to build acceleration structures: " << e.what() << std::endl; + } + return false; + } +} + +bool Renderer::refitTopLevelAS(const std::vector& entities, CameraComponent* camera) { + try { + if (!rayQueryEnabled || !accelerationStructureEnabled) + return false; + if (!*tlasStructure.handle) + return false; + if (!*tlasInstancesBuffer || !tlasInstancesAllocation || tlasInstanceOrder.size() != tlasInstanceCount) + return false; + + // Update instance transforms in the persistent instances buffer + auto* instPtr = reinterpret_cast(tlasInstancesAllocation->mappedPtr); + if (!instPtr) + return false; + + auto lastKick = std::chrono::steady_clock::now(); + auto kickWatchdog = [&]() { + auto now = std::chrono::steady_clock::now(); + if (now - lastKick > std::chrono::milliseconds(200)) { + 
lastFrameUpdateTime.store(now, std::memory_order_relaxed); + lastKick = now; + } + }; + + // Optional culling parity with raster: mask TLAS instances using the same frustum + distance-LOD checks. + // Use the same culling toggles as the raster path. + const bool doFrustumCulling = enableFrustumCulling && camera; + const bool doDistanceLOD = enableDistanceLOD && camera; + FrustumPlanes frustum{}; + if (doFrustumCulling) { + const glm::mat4 vp = camera->GetProjectionMatrix() * camera->GetViewMatrix(); + frustum = extractFrustumPlanes(vp); + } + const float camFovRad = camera ? glm::radians(camera->GetFieldOfView()) : glm::radians(60.0f); + + for (uint32_t i = 0; i < tlasInstanceCount; ++i) { + kickWatchdog(); + const TlasInstanceRef& ref = tlasInstanceOrder[i]; + Entity* entity = ref.entity; + if (!entity || !entity->IsActive()) { + instPtr[i].mask = 0u; + continue; + } + auto* transform = entity->GetComponent(); + glm::mat4 entityModel = transform ? transform->GetModelMatrix() : glm::mat4(1.0f); + + // If this TLAS entry represents a MeshComponent instance, multiply by the instance's model + glm::mat4 finalModel = entityModel; + auto* meshComp = entity->GetComponent(); + if (ref.instanced) { + if (meshComp && ref.instanceIndex < meshComp->GetInstanceCount()) { + const InstanceData& id = meshComp->GetInstance(ref.instanceIndex); + finalModel = entityModel * id.getModelMatrix(); + } + } + + const float* m = glm::value_ptr(finalModel); + vk::TransformMatrixKHR vkTransform; + for (int row = 0; row < 3; row++) { + for (int col = 0; col < 4; col++) { + vkTransform.matrix[row][col] = m[col * 4 + row]; + } + } + instPtr[i].transform = vkTransform; + + // Apply culling via instance mask (mask=0 => skipped by ray queries with mask=0xFF). 
+ uint32_t mask = 0xFFu; + + // Determine if environment (skybox/dome) - ALWAYS check this to ensure correct shadow masking + std::string nameLower = entity->GetName(); + std::transform(nameLower.begin(), nameLower.end(), nameLower.begin(), [](unsigned char c){ return std::tolower(c); }); + bool isEnvironment = (nameLower.find("sky") != std::string::npos) || + (nameLower.find("dome") != std::string::npos) || + (nameLower.find("env") != std::string::npos) || + (nameLower.find("bg") != std::string::npos) || + (nameLower.find("atmosphere") != std::string::npos) || + (nameLower.find("cloud") != std::string::npos) || + (nameLower.find("fog") != std::string::npos) || + (nameLower.find("background") != std::string::npos) || + (nameLower.find("exterior") != std::string::npos); + + // Safety check: if object is enormous, treat it as environment to prevent occlusion + if (!isEnvironment) { + glm::vec3 scale = transform ? transform->GetScale() : glm::vec3(1.0f); + if (scale.x > 400.0f || scale.y > 400.0f || scale.z > 400.0f) { + isEnvironment = true; + } + } + + // Default mask: 0x02 for environment (ignored by shadow rays), 0x01 for geometry (casters) + mask = isEnvironment ? 0x02u : 0x01u; + + if ((doFrustumCulling || doDistanceLOD) && meshComp && camera && meshComp->HasLocalAABB()) { + // (RayQuery): Avoid dropping instances from TLAS via mask=0 based on heuristic culling. + // AABBs for some instanced meshes can be unreliable, and `mask=0` removes the instance from all + // ray queries, causing visible objects to vanish. Keep TLAS coverage stable. 
+ const bool applyFrustumCullingToRayQueryTLASMask = false; + const bool applyDistanceLodToRayQueryTLASMask = false; + bool visible = true; + glm::vec3 wmin{}, wmax{}; + transformAABB(finalModel, meshComp->GetBaseMeshAABBMin(), meshComp->GetBaseMeshAABBMax(), wmin, wmax); + + if (doFrustumCulling && applyFrustumCullingToRayQueryTLASMask && !aabbIntersectsFrustum(wmin, wmax, frustum)) { + visible = false; + } + + if (visible && doDistanceLOD && applyDistanceLodToRayQueryTLASMask) { + // Match raster LOD heuristic (projected-size skip) + glm::vec3 center = 0.5f * (wmin + wmax); + glm::vec3 extents = 0.5f * (wmax - wmin); + float radius = glm::length(extents); + if (radius > 0.0f) { + glm::vec4 centerVS4 = camera->GetViewMatrix() * glm::vec4(center, 1.0f); + float z = std::abs(centerVS4.z); + if (z > 1e-3f) { + float pixelRadius = (radius * static_cast(swapChainExtent.height)) / + (z * 2.0f * std::tan(camFovRad * 0.5f)); + float pixelDiameter = pixelRadius * 2.0f; + + bool useBlended = false; + auto it = entityResources.find(entity); + if (it != entityResources.end()) { + ensureEntityMaterialCache(entity, it->second); + useBlended = it->second.cachedIsBlended; + } + + float threshold = useBlended ? 
lodPixelThresholdTransparent : lodPixelThresholdOpaque; + if (pixelDiameter < threshold) { + visible = false; + } + } + } + + } + + if (!visible) { + mask = 0u; + } + } + instPtr[i].mask = mask; + } + + // Prepare UPDATE build info + vk::DeviceAddress instancesAddress = getBufferDeviceAddress(device, *tlasInstancesBuffer); + + vk::AccelerationStructureGeometryKHR tlasGeometry{}; + tlasGeometry.geometryType = vk::GeometryTypeKHR::eInstances; + tlasGeometry.flags = {}; + tlasGeometry.geometry.instances = vk::AccelerationStructureGeometryInstancesDataKHR{ + .arrayOfPointers = VK_FALSE, + .data = instancesAddress + }; + + vk::AccelerationStructureBuildGeometryInfoKHR tlasBuildInfo{}; + tlasBuildInfo.type = vk::AccelerationStructureTypeKHR::eTopLevel; + tlasBuildInfo.flags = vk::BuildAccelerationStructureFlagBitsKHR::ePreferFastTrace | + vk::BuildAccelerationStructureFlagBitsKHR::eAllowUpdate; + tlasBuildInfo.mode = vk::BuildAccelerationStructureModeKHR::eUpdate; + tlasBuildInfo.geometryCount = 1; + tlasBuildInfo.pGeometries = &tlasGeometry; + tlasBuildInfo.srcAccelerationStructure = *tlasStructure.handle; + tlasBuildInfo.dstAccelerationStructure = *tlasStructure.handle; + + if (!*tlasUpdateScratchBuffer || !tlasUpdateScratchAllocation) { + // No update scratch; cannot refit + return false; + } + vk::DeviceAddress updateScratch = getBufferDeviceAddress(device, *tlasUpdateScratchBuffer); + tlasBuildInfo.scratchData = updateScratch; + + vk::AccelerationStructureBuildRangeInfoKHR tlasRangeInfo{}; + tlasRangeInfo.primitiveCount = tlasInstanceCount; + tlasRangeInfo.primitiveOffset = 0; + tlasRangeInfo.firstVertex = 0; + tlasRangeInfo.transformOffset = 0; + + // Create transient command buffer for UPDATE + vk::CommandPoolCreateInfo poolInfo{}; + poolInfo.flags = vk::CommandPoolCreateFlagBits::eTransient; + poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily.value(); + vk::raii::CommandPool cmdPool(device, poolInfo); + + vk::CommandBufferAllocateInfo allocInfo{}; + 
allocInfo.commandPool = *cmdPool; + allocInfo.level = vk::CommandBufferLevel::ePrimary; + allocInfo.commandBufferCount = 1; + vk::raii::CommandBuffers cmdBuffers(device, allocInfo); + vk::raii::CommandBuffer& cmd = cmdBuffers[0]; + cmd.begin(vk::CommandBufferBeginInfo{.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + std::array ranges = {&tlasRangeInfo}; + cmd.buildAccelerationStructuresKHR(tlasBuildInfo, ranges); + + cmd.end(); + + // Submit and wait + vk::SubmitInfo submitInfo{}; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &(*cmd); + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + SubmitToQueue2(*graphicsQueue, *cmd, false, nullptr, *fence); + // Wait with periodic watchdog kicks to avoid false hang detection on long refits. + // Use a longer timeout (30s) for large scenes in Debug + (void) waitForFencesSafe(*fence, VK_TRUE, 30'000'000'000ULL); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to refit TLAS: " << e.what() << std::endl; + return false; + } +} + +/** + * @brief Update ray query descriptor sets with current resources. + * + * Binds UBO, TLAS, output image, and light buffer to the descriptor set. + * + * @param frameIndex The frame index to update. + * @return True if successful, false otherwise. 
+ */
+bool Renderer::updateRayQueryDescriptorSets(uint32_t frameIndex, const std::vector& entities) {
+    if (!rayQueryEnabled || !accelerationStructureEnabled) {
+        return false;
+    }
+    if (frameIndex >= MAX_FRAMES_IN_FLIGHT) {
+        return false;
+    }
+
+    // Do not update descriptors while descriptor sets are known invalid
+    if (!descriptorSetsValid.load(std::memory_order_relaxed)) {
+        return false;
+    }
+
+    // (Re)allocate one ray query descriptor set per frame in flight, replacing any existing sets.
+    // May throw; callers are expected to catch.
+    auto allocateRayQuerySets = [&]() {
+        std::vector layouts(MAX_FRAMES_IN_FLIGHT, *rayQueryDescriptorSetLayout);
+        vk::DescriptorSetAllocateInfo allocInfo{};
+        allocInfo.descriptorPool = *descriptorPool;
+        allocInfo.descriptorSetCount = MAX_FRAMES_IN_FLIGHT;
+        allocInfo.pSetLayouts = layouts.data();
+        {
+            std::lock_guard lk(descriptorMutex);
+            rayQueryDescriptorSets = vk::raii::DescriptorSets(device, allocInfo);
+        }
+        // Freshly allocated sets contain no descriptors yet. Clear the per-frame "written"
+        // flags so the "nothing changed" early-out further down cannot skip populating them.
+        rayQueryDescriptorsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+    };
+
+    // Ensure descriptor sets exist for this frame; if missing/invalid, (re)allocate them now at the safe point
+    auto ensureRayQuerySets = [&]() -> bool {
+        try {
+            if (rayQueryDescriptorSets.empty() || frameIndex >= rayQueryDescriptorSets.size()) {
+                allocateRayQuerySets();
+            }
+            // Validate the handle for the current frame
+            vk::DescriptorSet testHandle = *rayQueryDescriptorSets[frameIndex];
+            if (!testHandle) {
+                // Reallocate once more if handle is null
+                allocateRayQuerySets();
+                testHandle = *rayQueryDescriptorSets[frameIndex];
+                if (!testHandle)
+                    return false;
+            }
+            return true;
+        } catch (const std::exception& e) {
+            std::cerr << "Ray query descriptor set (re)allocation failed: " << e.what() << "\n";
+            return false;
+        }
+    };
+    if (!ensureRayQuerySets()) {
+        return false;
+    }
+
+    // Validate descriptor set handle is valid before dereferencing
+    try {
+        vk::DescriptorSet
testHandle = *rayQueryDescriptorSets[frameIndex]; + if (!testHandle) { + // Try reallocate once more + if (!ensureRayQuerySets()) + return false; + } + } catch (const std::exception& e) { + std::cerr << "Ray query descriptor set handle invalid for frame " << frameIndex << ": " << e.what() << "\n"; + if (!ensureRayQuerySets()) + return false; + } + + // Check if TLAS handle is valid (dereference RAII handle to check underlying VkAccelerationStructureKHR) + if (!*tlasStructure.handle) { + std::cerr << "TLAS not built - cannot update ray query descriptor sets\n"; + return false; + } + + // Avoid doing expensive updates every frame. + // Binding 6 is a large descriptor array; updating it each frame can stall the CPU badly. + if (rayQueryDescriptorsWritten.size() != MAX_FRAMES_IN_FLIGHT) { + rayQueryDescriptorsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + const uint32_t bitMask = (1u << frameIndex); + const bool dirty = (rayQueryDescriptorsDirtyMask.load(std::memory_order_relaxed) & bitMask) != 0u; + const bool first = !rayQueryDescriptorsWritten[frameIndex]; + if (!dirty && !first) { + // Nothing changed that requires descriptor rebind for this frame. + return true; + } + + // Frame index alignment check: ensure we are updating descriptor set for the frame being recorded + if (frameIndex != currentFrame) { + // Not fatal, but indicates a mismatch in frame scheduling + // Avoid noisy logs every frame + } + + // TLAS is valid at this point; avoid verbose logging in default builds + vk::AccelerationStructureKHR tlasHandleValue = *tlasStructure.handle; + + if (lightStorageBuffers.empty() || frameIndex >= lightStorageBuffers.size()) { + std::cerr << "Light storage buffers not initialized\n"; + return false; + } + + try { + // NOTE: Ray Query no longer stores per-instance texture indices in `GeometryInfo`. + // Textures are resolved per-material via the material buffer, and the descriptor array + // is rebuilt each update from current streamed texture handles. 
+ + std::vector writes; + vk::DescriptorBufferInfo uboInfo{}; + vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{}; + vk::DescriptorImageInfo imageInfo{}; + vk::DescriptorBufferInfo lightInfo{}; + vk::DescriptorBufferInfo geoInfo{}; + vk::DescriptorBufferInfo matInfo{}; + + // NOTE: Do not write into mapped geometry info here. The buffer is built at AS build time + // and remains immutable to avoid races with refit and descriptor updates. + + // Binding 0: UBO - Use dedicated ray query UBO (not entity UBO) + if (rayQueryUniformBuffers.empty() || frameIndex >= rayQueryUniformBuffers.size()) { + std::cerr << "Ray query UBO not initialized for frame " << frameIndex << "\n"; + return false; + } + + uboInfo.buffer = *rayQueryUniformBuffers[frameIndex]; + uboInfo.offset = 0; + uboInfo.range = sizeof(RayQueryUniformBufferObject); + + vk::WriteDescriptorSet uboWrite{}; + uboWrite.dstSet = *rayQueryDescriptorSets[frameIndex]; + uboWrite.dstBinding = 0; + uboWrite.dstArrayElement = 0; + uboWrite.descriptorCount = 1; + uboWrite.descriptorType = vk::DescriptorType::eUniformBuffer; + uboWrite.pBufferInfo = &uboInfo; + writes.push_back(uboWrite); + + // Binding 1: TLAS (get address of underlying VkAccelerationStructureKHR) + tlasInfo.accelerationStructureCount = 1; + tlasInfo.pAccelerationStructures = &tlasHandleValue; + + vk::WriteDescriptorSet tlasWrite{}; + tlasWrite.dstSet = *rayQueryDescriptorSets[frameIndex]; + tlasWrite.dstBinding = 1; + tlasWrite.dstArrayElement = 0; + tlasWrite.descriptorCount = 1; + tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR; + tlasWrite.pNext = &tlasInfo; + writes.push_back(tlasWrite); + + // Binding 2: Output image + imageInfo.imageView = *rayQueryOutputImageView; + imageInfo.imageLayout = vk::ImageLayout::eGeneral; + + vk::WriteDescriptorSet imageWrite{}; + imageWrite.dstSet = *rayQueryDescriptorSets[frameIndex]; + imageWrite.dstBinding = 2; + imageWrite.dstArrayElement = 0; + imageWrite.descriptorCount = 1; + 
imageWrite.descriptorType = vk::DescriptorType::eStorageImage; + imageWrite.pImageInfo = &imageInfo; + writes.push_back(imageWrite); + + // Binding 3: Light buffer + lightInfo.buffer = *lightStorageBuffers[frameIndex].buffer; + lightInfo.offset = 0; + lightInfo.range = VK_WHOLE_SIZE; + + vk::WriteDescriptorSet lightWrite{}; + lightWrite.dstSet = *rayQueryDescriptorSets[frameIndex]; + lightWrite.dstBinding = 3; + lightWrite.dstArrayElement = 0; + lightWrite.descriptorCount = 1; + lightWrite.descriptorType = vk::DescriptorType::eStorageBuffer; + lightWrite.pBufferInfo = &lightInfo; + writes.push_back(lightWrite); + + // Binding 4: Geometry info buffer (vertex/index addresses + material indices) + if (*geometryInfoBuffer) { + geoInfo.buffer = *geometryInfoBuffer; + geoInfo.offset = 0; + geoInfo.range = VK_WHOLE_SIZE; + + vk::WriteDescriptorSet geoWrite{}; + geoWrite.dstSet = *rayQueryDescriptorSets[frameIndex]; + geoWrite.dstBinding = 4; + geoWrite.dstArrayElement = 0; + geoWrite.descriptorCount = 1; + geoWrite.descriptorType = vk::DescriptorType::eStorageBuffer; + geoWrite.pBufferInfo = &geoInfo; + writes.push_back(geoWrite); + } + + // Binding 5: Material buffer (PBR material properties) + if (*materialBuffer) { + matInfo.buffer = *materialBuffer; + matInfo.offset = 0; + matInfo.range = VK_WHOLE_SIZE; + + vk::WriteDescriptorSet matWrite{}; + matWrite.dstSet = *rayQueryDescriptorSets[frameIndex]; + matWrite.dstBinding = 5; + matWrite.dstArrayElement = 0; + matWrite.descriptorCount = 1; + matWrite.descriptorType = vk::DescriptorType::eStorageBuffer; + matWrite.pBufferInfo = &matInfo; + writes.push_back(matWrite); + } + + // Binding 6: Ray Query texture table (combined image samplers) + // IMPORTANT: Do NOT cache VkImageView/VkSampler handles across frames; textures can stream + // and their handles may be destroyed/recreated. 
+ if (rayQueryTexKeys.size() < RQ_SLOT_DEFAULT_EMISSIVE + 1 || rayQueryTexFallbackSlots.size() < RQ_SLOT_DEFAULT_EMISSIVE + 1) { + // Should be seeded during AS build; if not, fall back to using the generic default texture in all slots. + rayQueryTexKeys.resize(RQ_SLOT_DEFAULT_EMISSIVE + 1); + rayQueryTexFallbackSlots.resize(RQ_SLOT_DEFAULT_EMISSIVE + 1); + rayQueryTexCount = std::max(rayQueryTexCount, static_cast(rayQueryTexKeys.size())); + } + + const uint32_t copyCount = std::min(rayQueryTexCount, RQ_MAX_TEX); + // First-time init writes the full array with defaults so the set is fully defined. + // Subsequent refreshes update only the active range [0, copyCount), which is much faster. + const bool initFullArray = first; + const uint32_t writeCount = initFullArray ? RQ_MAX_TEX : copyCount; + std::vector rqArray(writeCount, + vk::DescriptorImageInfo{ + .sampler = *defaultTextureResources.textureSampler, + .imageView = *defaultTextureResources.textureImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }); + if (copyCount > 0) { + // Fill active slots under a short-lived shared lock, then release before taking descriptorMutex. + std::shared_lock texLock(textureResourcesMutex); + auto fillSlot = [&](uint32_t slot) { + if (slot >= copyCount) + return; + const std::string& key = rayQueryTexKeys[slot]; + if (!key.empty()) { + auto itTex = textureResources.find(key); + if (itTex != textureResources.end() && *itTex->second.textureImageView != VK_NULL_HANDLE && *itTex->second.textureSampler != VK_NULL_HANDLE) { + rqArray[slot].sampler = *itTex->second.textureSampler; + rqArray[slot].imageView = *itTex->second.textureImageView; + rqArray[slot].imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal; + return; + } + } + + // Not ready/missing: use slot-specific fallback. + uint32_t fb = (slot < rayQueryTexFallbackSlots.size()) ? 
rayQueryTexFallbackSlots[slot] : RQ_SLOT_DEFAULT_BASECOLOR; + if (fb >= copyCount) + fb = RQ_SLOT_DEFAULT_BASECOLOR; + const std::string& fbKey = (fb < rayQueryTexKeys.size()) ? rayQueryTexKeys[fb] : std::string{}; + if (!fbKey.empty()) { + auto itTex = textureResources.find(fbKey); + if (itTex != textureResources.end() && *itTex->second.textureImageView != VK_NULL_HANDLE && *itTex->second.textureSampler != VK_NULL_HANDLE) { + rqArray[slot].sampler = *itTex->second.textureSampler; + rqArray[slot].imageView = *itTex->second.textureImageView; + rqArray[slot].imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal; + } + } + }; + + for (uint32_t i = 0; i < copyCount; ++i) { + // Kick watchdog occasionally during large descriptor table fills. + if ((i % 128u) == 0u) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + fillSlot(i); + } + } + + if (writeCount > 0) { + vk::WriteDescriptorSet texArrayWrite{}; + texArrayWrite.dstSet = *rayQueryDescriptorSets[frameIndex]; + texArrayWrite.dstBinding = 6; + texArrayWrite.dstArrayElement = 0; + texArrayWrite.descriptorCount = writeCount; + texArrayWrite.descriptorType = vk::DescriptorType::eCombinedImageSampler; + texArrayWrite.pImageInfo = rqArray.data(); + writes.push_back(texArrayWrite); + } { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(writes, nullptr); + } + rayQueryDescriptorsWritten[frameIndex] = true; + rayQueryDescriptorsDirtyMask.fetch_and(~bitMask, std::memory_order_relaxed); + + // No per-frame or one-shot debug prints here; keep logs quiet in production. 
+ + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to update ray query descriptor sets: " << e.what() << std::endl; + return false; + } +} diff --git a/attachments/sync2_engine/renderer_rendering.cpp b/attachments/sync2_engine/renderer_rendering.cpp new file mode 100644 index 00000000..6ca4139b --- /dev/null +++ b/attachments/sync2_engine/renderer_rendering.cpp @@ -0,0 +1,3011 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "imgui/imgui.h" +#include "imgui_system.h" +#include "mesh_component.h" +#include "model_loader.h" +#include "renderer.h" +#include "transform_component.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ===================== Culling helpers implementation ===================== + +Renderer::FrustumPlanes Renderer::extractFrustumPlanes(const glm::mat4& vp) { + // Work in row-major form for standard plane extraction by transposing GLM's column-major matrix + glm::mat4 m = glm::transpose(vp); + FrustumPlanes fp{}; + // Left : m[3] + m[0] + fp.planes[0] = m[3] + m[0]; + // Right : m[3] - m[0] + fp.planes[1] = m[3] - m[0]; + // Bottom : m[3] + m[1] + fp.planes[2] = m[3] + m[1]; + // Top : m[3] - m[1] + fp.planes[3] = m[3] - m[1]; + // Near : m[2] (matches Vulkan [0, 1] clip range) + fp.planes[4] = m[2]; + // Far : m[3] - m[2] + fp.planes[5] = m[3] - m[2]; + + // Normalize planes + for (auto& p : fp.planes) { + glm::vec3 n(p.x, p.y, p.z); + float len = glm::length(n); + if (len > 0.0f) { + p /= len; + } + } + return fp; +} + +void Renderer::transformAABB(const glm::mat4& M, + const glm::vec3& localMin, + const glm::vec3& localMax, + glm::vec3& outMin, + glm::vec3& outMax) { + // OBB (from model) to world AABB using center/extents and absolute 3x3 + const glm::vec3 c = 0.5f * (localMin + localMax); + const glm::vec3 e = 0.5f * (localMax - localMin); + + const glm::vec3 worldCenter = glm::vec3(M * glm::vec4(c, 1.0f)); + // Upper-left 3x3 + const glm::mat3 A = glm::mat3(M); + const glm::mat3 AbsA = glm::mat3(glm::abs(A[0]), glm::abs(A[1]), glm::abs(A[2])); + const glm::vec3 worldExtents = AbsA * e; // component-wise combination + + outMin = worldCenter - worldExtents; + outMax = worldCenter + worldExtents; +} + +bool Renderer::aabbIntersectsFrustum(const glm::vec3& worldMin, + const glm::vec3& worldMax, + const FrustumPlanes& frustum) { + // Use the p-vertex test against 
each plane; if outside any plane → culled + for (const auto& p : frustum.planes) { + const glm::vec3 n(p.x, p.y, p.z); + // Choose positive vertex (furthest in direction of normal) + glm::vec3 v{ + n.x >= 0.0f ? worldMax.x : worldMin.x, + n.y >= 0.0f ? worldMax.y : worldMin.y, + n.z >= 0.0f ? worldMax.z : worldMin.z + }; + + // If the most positive vertex is still on the negative side of the plane, + // then the entire box is on the negative side. + // Use a small epsilon to avoid numerical issues. + if (glm::dot(n, v) + p.w < -0.01f) { + return false; // completely outside + } + } + return true; +} + +// This file contains rendering-related methods from the Renderer class + +// Create swap chain +bool Renderer::createSwapChain() { + try { + // Query swap chain support + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(physicalDevice); + + // Choose swap surface format, present mode, and extent + vk::SurfaceFormatKHR surfaceFormat = chooseSwapSurfaceFormat(swapChainSupport.formats); + vk::PresentModeKHR presentMode = chooseSwapPresentMode(swapChainSupport.presentModes); + + vk::Extent2D extent = chooseSwapExtent(swapChainSupport.capabilities); + + // Choose image count + uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1; + if (swapChainSupport.capabilities.maxImageCount > 0 && imageCount > swapChainSupport.capabilities.maxImageCount) { + imageCount = swapChainSupport.capabilities.maxImageCount; + } + // Create swap chain info + vk::SwapchainCreateInfoKHR createInfo{ + .surface = *surface, + .minImageCount = imageCount, + .imageFormat = surfaceFormat.format, + .imageColorSpace = surfaceFormat.colorSpace, + .imageExtent = extent, + .imageArrayLayers = 1, + .imageUsage = vk::ImageUsageFlagBits::eColorAttachment | vk::ImageUsageFlagBits::eTransferDst, + .preTransform = swapChainSupport.capabilities.currentTransform, + .compositeAlpha = vk::CompositeAlphaFlagBitsKHR::eOpaque, + .presentMode = presentMode, + .clipped = VK_TRUE, + 
.oldSwapchain = nullptr + }; + + // Find queue families + QueueFamilyIndices indices = findQueueFamilies(physicalDevice); + + std::array queueFamilyIndicesLoc = {indices.graphicsFamily.value(), indices.presentFamily.value()}; + + // Set sharing mode + if (indices.graphicsFamily != indices.presentFamily) { + createInfo.imageSharingMode = vk::SharingMode::eConcurrent; + createInfo.queueFamilyIndexCount = static_cast(queueFamilyIndicesLoc.size()); + createInfo.pQueueFamilyIndices = queueFamilyIndicesLoc.data(); + } else { + createInfo.imageSharingMode = vk::SharingMode::eExclusive; + createInfo.queueFamilyIndexCount = 0; + createInfo.pQueueFamilyIndices = nullptr; + } + + // Create swap chain + swapChain = vk::raii::SwapchainKHR(device, createInfo); + + // Get swap chain images + swapChainImages = swapChain.getImages(); + + // Swapchain images start in UNDEFINED layout; track per-image layout for correct barriers. + swapChainImageLayouts.assign(swapChainImages.size(), vk::ImageLayout::eUndefined); + + // Store swap chain format and extent + swapChainImageFormat = surfaceFormat.format; + swapChainExtent = extent; + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create swap chain: " << e.what() << std::endl; + return false; + } +} + +// ===================== Planar reflections resources ===================== +bool Renderer::createReflectionResources(uint32_t width, uint32_t height) { + try { + destroyReflectionResources(); + reflections.clear(); + reflections.resize(MAX_FRAMES_IN_FLIGHT); + reflectionVPs.clear(); + reflectionVPs.resize(MAX_FRAMES_IN_FLIGHT, glm::mat4(1.0f)); + sampleReflectionVP = glm::mat4(1.0f); + + for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) { + auto& rt = reflections[i]; + rt.width = width; + rt.height = height; + + // Color RT: use swapchain format to match existing PBR pipeline rendering formats + vk::Format colorFmt = swapChainImageFormat; + auto [colorImg, colorAlloc] = createImagePooled( + width, + 
height, + colorFmt, + vk::ImageTiling::eOptimal, + // Allow sampling in glass and blitting to swapchain for diagnostics + vk::ImageUsageFlagBits::eColorAttachment | vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eDeviceLocal, + /*mipLevels*/ + 1, + vk::SharingMode::eExclusive, + {}); + rt.color = std::move(colorImg); + rt.colorAlloc = std::move(colorAlloc); + rt.colorView = createImageView(rt.color, colorFmt, vk::ImageAspectFlagBits::eColor, 1); + // Simple sampler for sampling reflection texture (no mips) + vk::SamplerCreateInfo sampInfo{.magFilter = vk::Filter::eLinear, .minFilter = vk::Filter::eLinear, .mipmapMode = vk::SamplerMipmapMode::eNearest, .addressModeU = vk::SamplerAddressMode::eClampToEdge, .addressModeV = vk::SamplerAddressMode::eClampToEdge, .addressModeW = vk::SamplerAddressMode::eClampToEdge, .minLod = 0.0f, .maxLod = 0.0f}; + rt.colorSampler = vk::raii::Sampler(device, sampInfo); + + // Depth RT + vk::Format depthFmt = findDepthFormat(); + auto [depthImg, depthAlloc] = createImagePooled( + width, + height, + depthFmt, + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eDepthStencilAttachment, + vk::MemoryPropertyFlagBits::eDeviceLocal, + /*mipLevels*/ + 1, + vk::SharingMode::eExclusive, + {}); + rt.depth = std::move(depthImg); + rt.depthAlloc = std::move(depthAlloc); + rt.depthView = createImageView(rt.depth, depthFmt, vk::ImageAspectFlagBits::eDepth, 1); + } + + // One-time initialization: transition all per-frame reflection color images + // from UNDEFINED to SHADER_READ_ONLY_OPTIMAL so that the first frame can + // legally sample the "previous" frame's image. 
+ if (!reflections.empty()) { + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{.commandPool = *tempPool, .level = vk::CommandBufferLevel::ePrimary, .commandBufferCount = 1}; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + std::vector barriers; + barriers.reserve(reflections.size()); + for (auto& rt : reflections) { + if (!!*rt.color) { + barriers.push_back(vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *rt.color, + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }); + } + } + if (!barriers.empty()) { + vk::DependencyInfo depInfo{.imageMemoryBarrierCount = static_cast(barriers.size()), .pImageMemoryBarriers = barriers.data()}; + cb.pipelineBarrier2(depInfo); + } + cb.end(); + vk::SubmitInfo submit{.commandBufferCount = 1, .pCommandBuffers = &*cb}; + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); { + std::lock_guard lock(queueMutex); + graphicsQueue.submit(submit, *fence); + } + vk::Result result = waitForFencesSafe(*fence, VK_TRUE); + if (result != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for reflection resource fence: " << vk::to_string(result) << std::endl; + } + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to 
create reflection resources: " << e.what() << std::endl; + destroyReflectionResources(); + return false; + } +} + +void Renderer::destroyReflectionResources() { + for (auto& rt : reflections) { + rt.colorSampler = vk::raii::Sampler(nullptr); + rt.colorView = vk::raii::ImageView(nullptr); + rt.colorAlloc = nullptr; + rt.color = vk::raii::Image(nullptr); + rt.depthView = vk::raii::ImageView(nullptr); + rt.depthAlloc = nullptr; + rt.depth = vk::raii::Image(nullptr); + rt.width = rt.height = 0; + } +} + +void Renderer::renderReflectionPass(vk::raii::CommandBuffer& cmd, + const glm::vec4& planeWS, + CameraComponent* camera, + const std::vector& jobs) { + if (reflections.empty()) + return; + auto& rt = reflections[currentFrame]; + if (rt.width == 0 || rt.height == 0 || !*rt.colorView || !*rt.depthView) + return; + + // Transition reflection color to COLOR_ATTACHMENT_OPTIMAL (Sync2) + vk::ImageMemoryBarrier2 toColor2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = {}, + .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead, + .oldLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *rt.color, + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + // Transition reflection depth to DEPTH_STENCIL_ATTACHMENT_OPTIMAL (Sync2) + vk::ImageMemoryBarrier2 toDepth2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = {}, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite | vk::AccessFlagBits2::eDepthStencilAttachmentRead, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .srcQueueFamilyIndex = 
VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *rt.depth, + .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1} + }; + std::array preBarriers{toColor2, toDepth2}; + vk::DependencyInfo depInfoToColor{.imageMemoryBarrierCount = static_cast(preBarriers.size()), .pImageMemoryBarriers = preBarriers.data()}; + cmd.pipelineBarrier2(depInfoToColor); + + vk::RenderingAttachmentInfo colorAtt{ + .imageView = *rt.colorView, + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eStore, + // Clear to black so scene content dominates reflections + .clearValue = vk::ClearValue{vk::ClearColorValue{std::array < float, 4 >{0.0f, 0.0f, 0.0f, 1.0f}}} + }; + vk::RenderingAttachmentInfo depthAtt{ + .imageView = *rt.depthView, + .imageLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eDontCare, + .clearValue = vk::ClearValue{vk::ClearDepthStencilValue{1.0f, 0}} + }; + vk::RenderingInfo rinfo{ + .renderArea = vk::Rect2D({0, 0}, {rt.width, rt.height}), + .layerCount = 1, + .colorAttachmentCount = 1, + .pColorAttachments = &colorAtt, + .pDepthAttachment = &depthAtt + }; + cmd.beginRendering(rinfo); + // Compute mirrored view matrix about planeWS (default Y=0 plane) + glm::mat4 reflectM(1.0f); + // For Y=0 plane, reflection is simply flip Y + if (glm::length(glm::vec3(planeWS.x, planeWS.y, planeWS.z)) > 0.5f && fabsf(planeWS.y - 1.0f) < 1e-3f && fabsf(planeWS.x) < 1e-3f && fabsf(planeWS.z) < 1e-3f) { + reflectM[1][1] = -1.0f; + } else { + // General plane reflection matrix R = I - 2*n*n^T for normalized plane; ignore translation for now + glm::vec3 n = glm::normalize(glm::vec3(planeWS)); + glm::mat3 R = glm::mat3(1.0f) - 2.0f * glm::outerProduct(n, n); + reflectM = glm::mat4(R); + } + + glm::mat4 viewReflected = camera ? 
(camera->GetViewMatrix() * reflectM) : reflectM; + glm::mat4 projReflected = camera ? camera->GetProjectionMatrix() : glm::mat4(1.0f); + currentReflectionVP = projReflected * viewReflected; + currentReflectionPlane = planeWS; + if (currentFrame < reflectionVPs.size()) { + reflectionVPs[currentFrame] = currentReflectionVP; + } + + // Set viewport/scissor to reflection RT size + vk::Viewport rv(0.0f, 0.0f, static_cast(rt.width), static_cast(rt.height), 0.0f, 1.0f); + cmd.setViewport(0, rv); + vk::Rect2D rs({0, 0}, {rt.width, rt.height}); + cmd.setScissor(0, rs); + + // Draw opaque entities with mirrored view + // Use reflection-specific pipeline (cull none) to avoid mirrored winding issues. + if (!!*pbrReflectionGraphicsPipeline) { + cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, *pbrReflectionGraphicsPipeline); + } else if (!!*pbrGraphicsPipeline) { + cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, *pbrGraphicsPipeline); + } + + // Prepare frustum for mirrored view to allow culling + FrustumPlanes reflectFrustum = extractFrustumPlanes(currentReflectionVP); + + // Render all jobs (skip transparency) + for (const auto& job : jobs) { + Entity* entity = job.entity; + MeshComponent* meshComponent = job.meshComp; + EntityResources* entityRes = job.entityRes; + MeshResources* meshRes = job.meshRes; + + if (entityRes->cachedIsBlended) + continue; + + // Frustum culling for mirrored view + if (meshComponent->HasLocalAABB()) { + const glm::mat4 model = job.transformComp ? 
job.transformComp->GetModelMatrix() : glm::mat4(1.0f); + glm::vec3 wmin, wmax; + transformAABB(model, meshComponent->GetLocalAABBMin(), meshComponent->GetLocalAABBMax(), wmin, wmax); + if (!aabbIntersectsFrustum(wmin, wmax, reflectFrustum)) { + continue; // culled from reflection + } + } + + // Bind geometry + std::array buffers = {*meshRes->vertexBuffer, *entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + cmd.bindVertexBuffers(0, buffers, offsets); + cmd.bindIndexBuffer(*meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + // Populate UBO with mirrored view + clip plane and reflection flags + UniformBufferObject ubo{}; + if (job.transformComp) + ubo.model = job.transformComp->GetModelMatrix(); + else + ubo.model = glm::mat4(1.0f); + ubo.view = viewReflected; + ubo.proj = projReflected; + ubo.camPos = glm::vec4(camera ? camera->GetPosition() : glm::vec3(0), 1.0f); + ubo.reflectionPass = 1; + ubo.reflectionEnabled = 0; + ubo.reflectionVP = currentReflectionVP; + ubo.clipPlaneWS = planeWS; + // Ray query shadows in reflection pass + ubo.padding2 = enableRasterRayQueryShadows ? 
1.0f : 0.0f; + + updateUniformBufferInternal(currentFrame, entity, entityRes, camera, ubo); + + // Bind descriptor set (PBR set 0) + cmd.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, + *pbrPipelineLayout, + 0, + *entityRes->pbrDescriptorSets[currentFrame], + nullptr); + + // Push material properties + MaterialProperties mp = entityRes->cachedMaterialProps; + // Transmission suppressed during reflection pass via UBO (reflectionPass=1) + mp.transmissionFactor = 0.0f; + pushMaterialProperties(*cmd, mp); + + // Issue draw + uint32_t instanceCount = std::max(1u, static_cast(meshComponent->GetInstanceCount())); + cmd.drawIndexed(meshRes->indexCount, instanceCount, 0, 0, 0); + } + + cmd.endRendering(); + + // Transition reflection color to SHADER_READ_ONLY for sampling in main pass (Sync2) + vk::ImageMemoryBarrier2 toSample2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *rt.color, + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfoToSample{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toSample2}; + cmd.pipelineBarrier2(depInfoToSample); +} + +// Create image views +bool Renderer::createImageViews() { + try { + opaqueSceneColorImages.clear(); + opaqueSceneColorImageAllocations.clear(); + opaqueSceneColorImageViews.clear(); + opaqueSceneColorImageLayouts.clear(); + opaqueSceneColorSampler.clear(); + // Resize image views vector + swapChainImageViews.clear(); + swapChainImageViews.reserve(swapChainImages.size()); + + // Create image view info template (image will be set per iteration) + 
vk::ImageViewCreateInfo createInfo{ + .viewType = vk::ImageViewType::e2D, + .format = swapChainImageFormat, + .components = { + .r = vk::ComponentSwizzle::eIdentity, + .g = vk::ComponentSwizzle::eIdentity, + .b = vk::ComponentSwizzle::eIdentity, + .a = vk::ComponentSwizzle::eIdentity + }, + .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1} + }; + + // Create image view for each swap chain image + for (size_t i = 0; i < swapChainImages.size(); i++) { + createInfo.image = swapChainImages[i]; + swapChainImageViews.emplace_back(device, createInfo); + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create image views: " << e.what() << std::endl; + return false; + } +} + +// Setup dynamic rendering +bool Renderer::setupDynamicRendering() { + try { + // Create color attachment + colorAttachments = { + vk::RenderingAttachmentInfo{ + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearColorValue(std::array < float, 4 >{0.0f, 0.0f, 0.0f, 1.0f}) + } + }; + + // Create depth attachment + depthAttachment = vk::RenderingAttachmentInfo{ + .imageLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eClear, + .storeOp = vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearDepthStencilValue(1.0f, 0) + }; + + // Create rendering info + renderingInfo = vk::RenderingInfo{ + .renderArea = vk::Rect2D(vk::Offset2D(0, 0), swapChainExtent), + .layerCount = 1, + .colorAttachmentCount = static_cast(colorAttachments.size()), + .pColorAttachments = colorAttachments.data(), + .pDepthAttachment = &depthAttachment + }; + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to setup dynamic rendering: " << e.what() << std::endl; + return false; + } +} + +// Create command pool +bool 
Renderer::createCommandPool() { + try { + // Find queue families + QueueFamilyIndices queueFamilyIndicesLoc = findQueueFamilies(physicalDevice); + + // Create command pool info + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndicesLoc.graphicsFamily.value() + }; + + // Create command pool + commandPool = vk::raii::CommandPool(device, poolInfo); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create command pool: " << e.what() << std::endl; + return false; + } +} + +// Create command buffers +bool Renderer::createCommandBuffers() { + try { + // Resize command buffers vector + commandBuffers.clear(); + commandBuffers.reserve(MAX_FRAMES_IN_FLIGHT); + + // Create command buffer allocation info + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *commandPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = static_cast(MAX_FRAMES_IN_FLIGHT) + }; + + // Allocate command buffers + commandBuffers = vk::raii::CommandBuffers(device, allocInfo); + for (size_t i = 0; i < commandBuffers.size(); ++i) { + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create command buffers: " << e.what() << std::endl; + return false; + } +} + +// Create sync objects +bool Renderer::createSyncObjects() { + try { + // Resize semaphores and fences vectors + imageAvailableSemaphores.clear(); + renderFinishedSemaphores.clear(); + inFlightFences.clear(); + + // Semaphores per swapchain image (indexed by imageIndex from acquireNextImage) + // The presentation engine holds semaphores until the image is re-acquired, so we need + // one semaphore per swapchain image to avoid reuse conflicts. 
See Vulkan spec: + // https://docs.vulkan.org/guide/latest/swapchain_semaphore_reuse.html + const auto semaphoreCount = static_cast(swapChainImages.size()); + imageAvailableSemaphores.reserve(semaphoreCount); + renderFinishedSemaphores.reserve(semaphoreCount); + + // Fences per frame-in-flight for CPU-GPU synchronization (indexed by currentFrame) + inFlightFences.reserve(MAX_FRAMES_IN_FLIGHT); + + // Create semaphore info + vk::SemaphoreCreateInfo semaphoreInfo{}; + + // Create semaphores per swapchain image (indexed by imageIndex for presentation sync) + for (uint32_t i = 0; i < semaphoreCount; i++) { + imageAvailableSemaphores.emplace_back(device, semaphoreInfo); + renderFinishedSemaphores.emplace_back(device, semaphoreInfo); + } + + // Create fences per frame-in-flight (indexed by currentFrame for CPU-GPU pacing) + vk::FenceCreateInfo fenceInfo{ + .flags = vk::FenceCreateFlagBits::eSignaled + }; + for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + inFlightFences.emplace_back(device, fenceInfo); + } + + // Ensure uploads timeline semaphore exists (created early in createLogicalDevice) + // No action needed here unless reinitializing after swapchain recreation. + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create sync objects: " << e.what() << std::endl; + return false; + } +} + +// Clean up swap chain +void Renderer::cleanupSwapChain() { + // Clean up depth resources + depthImageView = vk::raii::ImageView(nullptr); + depthImage = vk::raii::Image(nullptr); + depthImageAllocation = nullptr; + + // Clean up swap chain image views + swapChainImageViews.clear(); + + // Note: Keep descriptor pool alive here to ensure descriptor sets remain valid during swapchain recreation. + // descriptorPool is preserved; it will be managed during full renderer teardown. 
+ + // Destroy reflection render targets if present + destroyReflectionResources(); + + // Clean up pipelines + graphicsPipeline = vk::raii::Pipeline(nullptr); + pbrGraphicsPipeline = vk::raii::Pipeline(nullptr); + lightingPipeline = vk::raii::Pipeline(nullptr); + + // Clean up pipeline layouts + pipelineLayout = vk::raii::PipelineLayout(nullptr); + pbrPipelineLayout = vk::raii::PipelineLayout(nullptr); + lightingPipelineLayout = vk::raii::PipelineLayout(nullptr); + + // Clean up sync objects (they need to be recreated with new swap chain image count) + imageAvailableSemaphores.clear(); + renderFinishedSemaphores.clear(); + inFlightFences.clear(); + + // Clean up swap chain + swapChain = vk::raii::SwapchainKHR(nullptr); +} + +// Recreate swap chain +void Renderer::recreateSwapChain() { + // Prevent background uploads worker from mutating descriptors while we rebuild + StopUploadsWorker(); + + // Block descriptor writes while we rebuild swapchain and descriptor pools + descriptorSetsValid.store(false, std::memory_order_relaxed); { + // Drop any deferred descriptor updates that target old descriptor sets + std::lock_guard lk(pendingDescMutex); + pendingDescOps.clear(); + descriptorRefreshPending.store(false, std::memory_order_relaxed); + } + + // Wait for all frames in flight to complete using the timeline + if (*frameTimeline) { + uint64_t waitValue = totalFrameCount.load(); + vk::SemaphoreWaitInfo waitInfo{ + .semaphoreCount = 1, + .pSemaphores = &*frameTimeline, + .pValues = &waitValue + }; + if (device.waitSemaphores(waitInfo, UINT64_MAX) != vk::Result::eSuccess) { + std::cerr << "Warning: Failed to wait for frameTimeline during swapchain recreation" << std::endl; + } + } + + // Wait for the device to be idle before recreating the swap chain + // External synchronization required (VVL): serialize against queue submits/present. 
+ WaitIdle(); + + // Clean up old swap chain resources + cleanupSwapChain(); + + // Recreate swap chain and related resources + createSwapChain(); + createImageViews(); + setupDynamicRendering(); + createDepthResources(); + + // (Re)create reflection resources if enabled + if (enablePlanarReflections) { + uint32_t rw = std::max(1u, static_cast(static_cast(swapChainExtent.width) * reflectionResolutionScale)); + uint32_t rh = std::max(1u, static_cast(static_cast(swapChainExtent.height) * reflectionResolutionScale)); + createReflectionResources(rw, rh); + } + + // Recreate sync objects with correct sizing for new swap chain + createSyncObjects(); + + // Recreate off-screen opaque scene color and descriptor sets needed by transparent pass + createOpaqueSceneColorResources(); + createTransparentDescriptorSets(); + createTransparentFallbackDescriptorSets(); + + // Wait for all command buffers to complete before clearing resources + for (const auto& fence : inFlightFences) { + vk::Result result = waitForFencesSafe(*fence, VK_TRUE); + if (result != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for fence before clearing resources: " << vk::to_string(result) << std::endl; + } + } + + // Clear all entity descriptor sets since they're now invalid (allocated from the old pool) + { + // Serialize descriptor frees against any other descriptor operations + std::lock_guard lk(descriptorMutex); + for (auto& kv : entityResources) { + auto& resources = kv.second; + resources.basicDescriptorSets.clear(); + resources.pbrDescriptorSets.clear(); + // Descriptor initialization flags must be reset because new descriptor sets + // will be allocated and only the current frame will be initialized at runtime. 
+ resources.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + } + + // Clear ray query descriptor sets - they reference the old output image which will be destroyed + // Must clear before recreating to avoid descriptor set corruption + rayQueryDescriptorSets.clear(); + rayQueryDescriptorsWritten.clear(); + rayQueryDescriptorsDirtyMask.store(0u, std::memory_order_relaxed); + + // Destroy ray query output image resources - they're sized to old swapchain dimensions + rayQueryOutputImageView = vk::raii::ImageView(nullptr); + rayQueryOutputImage = vk::raii::Image(nullptr); + rayQueryOutputImageAllocation = nullptr; + + createGraphicsPipeline(); + createPBRPipeline(); + createLightingPipeline(); + createCompositePipeline(); + + // Recreate Forward+ specific pipelines/resources and resize tile buffers for new extent + if (useForwardPlus) { + createDepthPrepassPipeline(); + uint32_t tilesX = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX; + uint32_t tilesY = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY; + createOrResizeForwardPlusBuffers(tilesX, tilesY, forwardPlusSlicesZ); + } + + // Re-create command buffers to ensure fresh recording against new swapchain state + commandBuffers.clear(); + createCommandBuffers(); + currentFrame = 0; + + // Recreate ray query resources with new swapchain dimensions + // This must happen after descriptor pool is valid but before marking descriptor sets valid + if (rayQueryEnabled && accelerationStructureEnabled) { + if (!createRayQueryResources()) { + std::cerr << "Warning: Failed to recreate ray query resources after swapchain recreation\n"; + } + } + + // Recreate descriptor sets for all 
entities after swapchain/pipeline rebuild + for (const auto& kv : entityResources) { + const auto& entity = kv.first; + if (!entity) + continue; + auto meshComponent = entity->GetComponent(); + if (!meshComponent) + continue; + + std::string texturePath = meshComponent->GetTexturePath(); + // Fallback for basic pipeline: use baseColor when legacy path is empty + if (texturePath.empty()) { + const std::string& baseColor = meshComponent->GetBaseColorTexturePath(); + if (!baseColor.empty()) { + texturePath = baseColor; + } + } + // Recreate basic descriptor sets (ignore failures here to avoid breaking resize) + createDescriptorSets(entity, texturePath, false); + // Recreate PBR descriptor sets + createDescriptorSets(entity, texturePath, true); + } + + // Descriptor sets are now valid again + descriptorSetsValid.store(true, std::memory_order_relaxed); + + // Resume background uploads worker now that swapchain and descriptors are recreated + StartUploadsWorker(); +} + +void Renderer::prepareFrameUboTemplate(CameraComponent* camera) { + frameUboTemplate = UniformBufferObject{}; + if (!camera) return; + + frameUboTemplate.view = camera->GetViewMatrix(); + frameUboTemplate.proj = camera->GetProjectionMatrix(); + frameUboTemplate.proj[1][1] *= -1; // Flip Y for Vulkan + frameUboTemplate.camPos = glm::vec4(camera->GetPosition(), 1.0f); + + frameUboTemplate.lightCount = static_cast(lastFrameLightCount); + frameUboTemplate.exposure = std::clamp(this->exposure, 0.2f, 4.0f); + frameUboTemplate.gamma = this->gamma; + frameUboTemplate.screenDimensions = glm::vec2(swapChainExtent.width, swapChainExtent.height); + frameUboTemplate.nearZ = camera->GetNearPlane(); + frameUboTemplate.farZ = camera->GetFarPlane(); + frameUboTemplate.slicesZ = static_cast(forwardPlusSlicesZ); + + int outputIsSRGB = (swapChainImageFormat == vk::Format::eR8G8B8A8Srgb || + swapChainImageFormat == vk::Format::eB8G8R8A8Srgb) ? 
1 : 0; + frameUboTemplate.padding0 = outputIsSRGB; + // Raster PBR shader uses padding1 as the Forward+ enable flag. + // 0 = disabled (always use global light loop), non-zero = enabled (use culled tile lists). + frameUboTemplate.padding1 = useForwardPlus ? 1.0f : 0.0f; + frameUboTemplate.padding2 = enableRasterRayQueryShadows ? 1.0f : 0.0f; + + bool reflReady = false; + if (enablePlanarReflections && !reflections.empty()) { + const uint32_t count = static_cast(reflections.size()); + const uint32_t prev = (currentFrame + count - 1u) % count; + auto& rtPrev = reflections[prev]; + reflReady = (!!*rtPrev.colorView) && (!!*rtPrev.colorSampler); + } + frameUboTemplate.reflectionEnabled = reflReady ? 1 : 0; + frameUboTemplate.reflectionVP = sampleReflectionVP; + frameUboTemplate.clipPlaneWS = currentReflectionPlane; + frameUboTemplate.reflectionIntensity = std::clamp(reflectionIntensity, 0.0f, 2.0f); + frameUboTemplate.enableRayQueryReflections = enableRayQueryReflections ? 1 : 0; + frameUboTemplate.enableRayQueryTransparency = enableRayQueryTransparency ? 1 : 0; + + // Ray-query shared buffers are also used by raster PBR when doing ray-query shadows. + // Populate counts so shaders can bounds-check even when running in raster mode. + frameUboTemplate.geometryInfoCount = static_cast(geometryInfoCountCPU); + frameUboTemplate.materialCount = static_cast(materialCountCPU); +} + +// Update uniform buffer +void Renderer::updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, TransformComponent* tc) { + if (!entityRes) { + return; + } + + // Get transform component + auto transformComponent = tc ? tc : (entity ? 
entity->GetComponent() : nullptr); + if (!transformComponent) { + return; + } + + // Create uniform buffer object + UniformBufferObject ubo{}; + ubo.model = transformComponent->GetModelMatrix(); + ubo.view = camera->GetViewMatrix(); + ubo.proj = camera->GetProjectionMatrix(); + ubo.proj[1][1] *= -1; // Flip Y for Vulkan + + // Continue with the rest of the uniform buffer setup + updateUniformBufferInternal(currentImage, entity, entityRes, camera, ubo); +} + +// Overloaded version that accepts a custom transform matrix +void Renderer::updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, const glm::mat4& customTransform) { + if (!entityRes) return; + // Create the uniform buffer object with custom transform + UniformBufferObject ubo{}; + ubo.model = customTransform; + ubo.view = camera->GetViewMatrix(); + ubo.proj = camera->GetProjectionMatrix(); + ubo.proj[1][1] *= -1; // Flip Y for Vulkan + + // Continue with the rest of the uniform buffer setup + updateUniformBufferInternal(currentImage, entity, entityRes, camera, ubo); +} + +// Internal helper function to complete uniform buffer setup +void Renderer::updateUniformBufferInternal(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, UniformBufferObject& ubo) { + if (!entityRes) { + return; + } + + // Use frame template for most fields + UniformBufferObject finalUbo = frameUboTemplate; + finalUbo.model = ubo.model; + + // For reflection pass, we must override view/proj/reflection flags + if (ubo.reflectionPass == 1) { + finalUbo.view = ubo.view; + finalUbo.proj = ubo.proj; + finalUbo.reflectionPass = 1; + finalUbo.reflectionEnabled = 0; + finalUbo.reflectionVP = ubo.reflectionVP; + finalUbo.clipPlaneWS = ubo.clipPlaneWS; + finalUbo.padding2 = ubo.padding2; + } + + // Copy to uniform buffer (guard against null mapped pointer) + void* dst = entityRes->uniformBuffersMapped[currentImage]; + if (!dst) { + std::cerr << 
"Warning: UBO mapped ptr null for entity '" << (entity ? entity->GetName() : "unknown") << "' frame " << currentImage << std::endl; + return; + } + std::memcpy(dst, &finalUbo, sizeof(UniformBufferObject)); +} + +void Renderer::ensureEntityMaterialCache(Entity* entity, EntityResources& res) { + if (!entity) + return; + + if (res.materialCacheValid) + return; + + res.materialCacheValid = true; + res.cachedMaterial = nullptr; + res.cachedIsBlended = false; + res.cachedIsGlass = false; + res.cachedIsLiquid = false; + + // Defaults represent the common case (no explicit material); textures come from descriptor bindings. + MaterialProperties mp{}; + // Sensible defaults for entities without explicit material + mp.baseColorFactor = glm::vec4(1.0f); + mp.metallicFactor = 0.0f; + mp.roughnessFactor = 1.0f; + mp.baseColorTextureSet = 0; + mp.physicalDescriptorTextureSet = 0; + mp.normalTextureSet = -1; + mp.occlusionTextureSet = -1; + mp.emissiveTextureSet = -1; + mp.alphaMask = 0.0f; + mp.alphaMaskCutoff = 0.5f; + mp.emissiveFactor = glm::vec3(0.0f); + mp.emissiveStrength = 1.0f; + mp.transmissionFactor = 0.0f; + mp.useSpecGlossWorkflow = 0; + mp.glossinessFactor = 0.0f; + mp.specularFactor = glm::vec3(1.0f); + mp.ior = 1.5f; + mp.hasEmissiveStrengthExtension = 0; + + if (modelLoader) { + const std::string& entityName = entity->GetName(); + const size_t tagPos = entityName.find("_Material_"); + if (tagPos != std::string::npos) { + const size_t afterTag = tagPos + std::string("_Material_").size(); + if (afterTag < entityName.length()) { + // Entity name format: "modelName_Material__" + const std::string remainder = entityName.substr(afterTag); + const size_t nextUnderscore = remainder.find('_'); + if (nextUnderscore != std::string::npos && nextUnderscore + 1 < remainder.length()) { + const std::string materialName = remainder.substr(nextUnderscore + 1); + if (const Material* material = modelLoader->GetMaterial(materialName)) { + res.cachedMaterial = material; + 
res.cachedIsGlass = material->isGlass; + res.cachedIsLiquid = material->isLiquid; + + // Base factors + mp.baseColorFactor = glm::vec4(material->albedo, material->alpha); + mp.metallicFactor = material->metallic; + mp.roughnessFactor = material->roughness; + + // Texture set flags (-1 = no texture) + mp.baseColorTextureSet = material->albedoTexturePath.empty() ? -1 : 0; + // physical descriptor: MR or SpecGloss + if (material->useSpecularGlossiness) { + mp.useSpecGlossWorkflow = 1; + mp.physicalDescriptorTextureSet = material->specGlossTexturePath.empty() ? -1 : 0; + mp.glossinessFactor = material->glossinessFactor; + mp.specularFactor = material->specularFactor; + } else { + mp.useSpecGlossWorkflow = 0; + mp.physicalDescriptorTextureSet = material->metallicRoughnessTexturePath.empty() ? -1 : 0; + } + mp.normalTextureSet = material->normalTexturePath.empty() ? -1 : 0; + mp.occlusionTextureSet = material->occlusionTexturePath.empty() ? -1 : 0; + mp.emissiveTextureSet = material->emissiveTexturePath.empty() ? -1 : 0; + + // Emissive and transmission/IOR + mp.emissiveFactor = material->emissive; + mp.emissiveStrength = material->emissiveStrength; + // Heuristic: consider emissive strength extension present when strength != 1.0 + mp.hasEmissiveStrengthExtension = (std::abs(material->emissiveStrength - 1.0f) > 1e-6f) ? 1 : 0; + mp.transmissionFactor = material->transmissionFactor; + mp.ior = material->ior; + + // Alpha mask handling + mp.alphaMask = (material->alphaMode == "MASK") ? 
1.0f : 0.0f; + mp.alphaMaskCutoff = material->alphaCutoff; + + // Blended classification (opaque materials stay in the opaque pass) + const bool alphaBlend = (material->alphaMode == "BLEND"); + const bool highTransmission = (material->transmissionFactor > 0.2f); + res.cachedIsBlended = alphaBlend || highTransmission || res.cachedIsGlass || res.cachedIsLiquid; + } + } + } + } + } + + res.cachedMaterialProps = mp; +} + +// Render the scene (unique_ptr container overload) +// Convert to a raw-pointer snapshot so callers can safely release their container locks. +void Renderer::Render(const std::vector>& entities, CameraComponent* camera, ImGuiSystem* imguiSystem) { + std::vector snapshot; + snapshot.reserve(entities.size()); + for (const auto& uptr : entities) { + snapshot.push_back(uptr.get()); + } + Render(snapshot, camera, imguiSystem); +} + +// Render the scene (raw pointer snapshot overload) +void Renderer::Render(const std::vector& entities, CameraComponent* camera, ImGuiSystem* imguiSystem) { + auto startRender = std::chrono::steady_clock::now(); + static uint64_t renderCallCount = 0; + // 1. Initial Load State Machine Management + // Keep the fullscreen overlay until geometry preallocation is done. + const InternalLoadingState currentLoadState = currentInternalLoadingState.load(std::memory_order_relaxed); + const bool isParsing = (currentLoadState == InternalLoadingState::Parsing); + const bool isPreallocating = (currentLoadState == InternalLoadingState::Preallocating); + const bool isPhysicsInit = (currentLoadState == InternalLoadingState::PhysicsInit); + const bool isPlaying = (currentLoadState == InternalLoadingState::Play); + + bool loadDone = isPlaying; + + if (isPlaying) { + initialLoadComplete.store(true, std::memory_order_relaxed); + } else { + // Determine loading UI text based on phase + // Only call SetLoadingPhase if it's NOT already correct, to avoid progress reset. + // SetLoadingPhase handles the check itself now. 
+ if (isParsing) SetLoadingPhase(LoadingPhase::Scene); + else if (isPreallocating) SetLoadingPhase(LoadingPhase::Scene); // Still geometry + else if (isPhysicsInit) SetLoadingPhase(LoadingPhase::Physics); + } + + // 1. Determine next frame value and wait for the previous frame slot to be ready using our frame timeline + // This replaces inFlightFences with a single monotonic counter and ensures proper CPU-GPU pacing. + const uint64_t nextFrameCount = totalFrameCount.load() + 1; + const uint64_t waitValue = (nextFrameCount > MAX_FRAMES_IN_FLIGHT) ? + ((nextFrameCount - MAX_FRAMES_IN_FLIGHT) * 10 + TimelineMilestones::eGpuWorkFinished) : 0; + + if (waitValue > 0) { + auto waitStart = std::chrono::steady_clock::now(); + watchdogProgressLabel.store("Render: wait frameTimeline", std::memory_order_relaxed); + vk::SemaphoreWaitInfo waitInfo{ + .semaphoreCount = 1, + .pSemaphores = &*frameTimeline, + .pValues = &waitValue + }; + // Always use a bounded timeout so the render loop never blocks forever. + uint64_t timeoutNs = 1'000'000'000; // 1 second + auto waitResult = device.waitSemaphores(waitInfo, timeoutNs); + + if (waitResult == vk::Result::eTimeout) { + // GPU is too busy; skip this frame to keep the UI/engine loop responsive. + // IMPORTANT: Do NOT advance or signal the frame timeline here. Only render UI and try again next frame. + if (renderCallCount % 10 == 0) { + } + if (imguiSystem) { + imguiSystem->EndFrameWithoutRendering(); + } + return; + } + + if (waitResult != vk::Result::eSuccess) { + std::cerr << "Error: Failed to wait for frameTimeline! Result: " << vk::to_string(waitResult) << std::endl; + if (imguiSystem) { + imguiSystem->EndFrameWithoutRendering(); + } + return; + } + } + + // Officially move to the next frame once synchronization is confirmed. 
+ totalFrameCount++; + renderCallCount++; + // Ensure currentFrame slot index is perfectly in sync with totalFrameCount + currentFrame = (totalFrameCount.load() - 1) % MAX_FRAMES_IN_FLIGHT; + currentTimelineValue = totalFrameCount.load() * 10; + + if (renderCallCount % 10 == 1) { + uint32_t texSched = textureTasksScheduled.load(std::memory_order_relaxed); + uint32_t texDone = textureTasksCompleted.load(std::memory_order_relaxed); + + // Update UI progress for background texture loading + if (texSched > 0 && texDone < texSched) { + // Automatically transition to Textures phase if we have jobs and aren't finishing up + if (GetLoadingPhase() == LoadingPhase::Scene || GetLoadingPhase() == LoadingPhase::Physics) { + SetLoadingPhase(LoadingPhase::Textures); + } + + if (GetLoadingPhase() == LoadingPhase::Textures) { + float progress = static_cast(texDone) / static_cast(texSched); + loadingPhaseProgress.store(progress, std::memory_order_relaxed); + } + } + } + + static uint64_t postLoadFrameCount = 0; + bool isPostLoad = false; + if (loadDone) { + postLoadFrameCount++; + isPostLoad = true; + } + + // 3. 
Update watchdog timestamp to prove frame is progressing + KickWatchdog(); + watchdogProgressLabel.store("Render: frame begin", std::memory_order_relaxed); + + // Suppress watchdog during heavy loading or while draining the preallocation queue + const bool stillPreallocating = pendingEntityPreallocQueued.load(std::memory_order_relaxed); + if (IsLoading() || stillPreallocating) { + watchdogSuppressed.store(true, std::memory_order_relaxed); + } else if (!asBuildRequested.load(std::memory_order_relaxed)) { + // Only unsuppress if no background heavy tasks are pending + watchdogSuppressed.store(false, std::memory_order_relaxed); + } + + // Execute any pending GPU uploads + watchdogProgressLabel.store("Render: ProcessPendingMeshUploads", std::memory_order_relaxed); + auto pmuStart = std::chrono::steady_clock::now(); + ProcessPendingMeshUploads(); + + // Drain some pending texture jobs every frame to guarantee forward progress + watchdogProgressLabel.store("Render: ProcessPendingTextureJobs", std::memory_order_relaxed); + ProcessPendingTextureJobs(/*maxJobs=*/16, /*includeCritical=*/true, /*includeNonCritical=*/true); + + // Execute pending entity preallocations with a time budget. + // Chunked preallocation: at most 1 entity per frame to keep UI responsive. + if (pendingEntityPreallocQueued.load(std::memory_order_relaxed)) { + watchdogProgressLabel.store("Render: ProcessPendingEntityPreallocations", std::memory_order_relaxed); + auto budgetStart = std::chrono::steady_clock::now(); + + // Chunked preallocation: 1 entity per frame to keep UI responsive + ProcessPendingEntityPreallocations(); + KickWatchdog(); + } + + // Check if we just finished the initial geometry preallocation AND all data is on GPU. + // This must be outside the 'if (pendingEntityPreallocQueued)' block to ensure the transition + // is evaluated correctly even if the queue becomes empty and uploads are still in-flight. 
+ if (currentInternalLoadingState.load(std::memory_order_relaxed) == InternalLoadingState::Preallocating && + !pendingEntityPreallocQueued.load(std::memory_order_relaxed) && + !IsSceneLoaderActive() && !HasPendingMeshUploads()) { + currentInternalLoadingState.store(InternalLoadingState::PhysicsInit, std::memory_order_release); + SetLoadingPhase(LoadingPhase::Physics); + // Trigger the first AS build now that all geometry is ready + asDevOverrideAllowRebuild = true; + RequestAccelerationStructureBuild("Initial geometry preallocation complete"); + } + + // Transition from PhysicsInit to Play once physics/base textures are mostly ready + if (currentInternalLoadingState.load() == InternalLoadingState::PhysicsInit) { + static int physicsInitFrames = 0; + if (++physicsInitFrames > 10) { + // Wait for AS build if requested, before moving to Play + if (!asBuildRequested.load(std::memory_order_acquire)) { + MarkInitialLoadComplete(); + SetLoading(false); + currentInternalLoadingState.store(InternalLoadingState::Play, std::memory_order_release); + } else { + // If AS build is pending, show the AccelerationStructures phase + if (GetLoadingPhase() != LoadingPhase::AccelerationStructures) { + SetLoadingPhase(LoadingPhase::AccelerationStructures); + } + } + } + } + + // Lock shared resources for the remainder of the render call + // (After preallocation and uploads are processed to avoid self-deadlocks) + std::shared_lock entityLock(entityResourcesMutex); + std::shared_lock meshLock(meshResourcesMutex); + + if (memoryPool) + memoryPool->setRenderingActive(true); + struct RenderingStateGuard { + MemoryPool* pool; + explicit RenderingStateGuard(MemoryPool* p) : pool(p) { + } + ~RenderingStateGuard() { + if (pool) + pool->setRenderingActive(false); + } + } guard(memoryPool.get()); + + // Track if ray query rendered successfully this frame to skip rasterization code path + bool rayQueryRenderedThisFrame = false; + + // --- Extract lights for the frame --- + // Build a single light 
list once per frame (emissive lights only for this scene) + std::vector lightsSubset; + if (loadDone && camera && !staticLights.empty()) { + lightsSubset.reserve(std::min(staticLights.size(), static_cast(MAX_ACTIVE_LIGHTS))); + for (const auto& L : staticLights) { + // Include all lights (Directional, Point, Emissive) up to the limit + lightsSubset.push_back(L); + if (lightsSubset.size() >= MAX_ACTIVE_LIGHTS) + break; + } + } + lastFrameLightCount = static_cast(lightsSubset.size()); + if (loadDone && camera && !lightsSubset.empty()) { + updateLightStorageBuffer(currentFrame, lightsSubset, camera); + } + + // Pre-calculate frame-constant UBO data + if (loadDone && camera) { + prepareFrameUboTemplate(camera); + } + + // 2. Improved Garbage Collection using Timeline Semaphore + // Instead of counting frames, we check if the GPU has reached the timeline value + // from when the resource was last used. + { + uint64_t gpuCompletedValue = frameTimeline.getCounterValue(); + auto it = pendingASDeletions.begin(); + while (it != pendingASDeletions.end()) { + // Check if the GPU has finished using this resource slot + if (it->timelineValue <= gpuCompletedValue) { + // Safe to delete + it = pendingASDeletions.erase(it); + } else { + ++it; + } + } + } + watchdogProgressLabel.store("Render: after pendingASDeletions", std::memory_order_relaxed); + + // Opportunistically request AS rebuild when more meshes become ready than in the last built AS. + // This makes the TLAS grow as streaming/allocations complete, then settle (no rebuild spam). + // NOTE: This scan can be relatively heavy and is not needed for the default startup path. + // Only run it when opportunistic rebuilds are enabled. + // While loading, allow opportunistic AS rebuild scanning even if the user-facing toggle is off. + // This prevents nondeterministic “missing outdoor props” across app restarts when the first TLAS + // build happens before all entities exist. 
+ if (rayQueryEnabled && accelerationStructureEnabled && (asOpportunisticRebuildEnabled || IsLoading())) { + // Only scan readiness periodically or during loading to avoid high CPU overhead + static auto lastScanTime = std::chrono::steady_clock::now(); + auto now = std::chrono::steady_clock::now(); + const auto currentLoadState = currentInternalLoadingState.load(std::memory_order_relaxed); + bool shouldScan = false; + if (currentLoadState == InternalLoadingState::PhysicsInit) { + // Allow scan in PhysicsInit with 1s interval + if (std::chrono::duration_cast(now - lastScanTime).count() > 1000) { + shouldScan = true; + } + } else if (currentLoadState == InternalLoadingState::Play) { + // 5s cooldown in Play state per specification + if (std::chrono::duration_cast(now - lastScanTime).count() >= 5000) { + shouldScan = true; + } + } + + // Disable entirely during Parsing/Preallocating + if (currentLoadState == InternalLoadingState::Parsing || currentLoadState == InternalLoadingState::Preallocating) { + shouldScan = false; + } + + // Skip expensive scan while heavy background preallocation is in progress + const bool stillPreallocating = pendingEntityPreallocQueued.load(std::memory_order_relaxed); + if (shouldScan && !stillPreallocating) { + lastScanTime = now; + watchdogProgressLabel.store("Render: AS readiness scan", std::memory_order_relaxed); + size_t readyRenderableCount = 0; + size_t readyUniqueMeshCount = 0; { + auto lastKick = std::chrono::steady_clock::now(); + auto kickWatchdog = [&]() { + auto now = std::chrono::steady_clock::now(); + if (now - lastKick > std::chrono::milliseconds(200)) { + lastFrameUpdateTime.store(now, std::memory_order_relaxed); + lastKick = now; + } + }; + + uint32_t processedInScan = 0; + std::unordered_map meshToBLASProbe; + for (Entity* e : entities) { + if (++processedInScan % 100 == 0) kickWatchdog(); + if (!e || !e->IsActive()) + continue; + // In Ray Query static-only mode, ignore dynamic/animated entities for readiness + if 
(IsRayQueryStaticOnly()) { + const std::string& nm = e->GetName(); + if (nm.find("_AnimNode_") != std::string::npos) + continue; + if (!nm.empty() && nm.rfind("Ball_", 0) == 0) + continue; + } + auto meshComp = e->GetComponent(); + if (!meshComp) + continue; + try { + auto it = meshResources.find(meshComp); + if (it == meshResources.end()) + continue; + const auto& res = it->second; + // STRICT readiness: uploads must be finished (staging sizes zero) + if (res.vertexBufferSizeBytes != 0 || res.indexBufferSizeBytes != 0) + continue; + if (!*res.vertexBuffer || !*res.indexBuffer) + continue; + if (res.indexCount == 0) + continue; + } catch (...) { + continue; + } + readyRenderableCount++; + if (meshToBLASProbe.find(meshComp) == meshToBLASProbe.end()) { + meshToBLASProbe[meshComp] = static_cast(meshToBLASProbe.size()); + } + } + readyUniqueMeshCount = meshToBLASProbe.size(); + } + + // Gate rebuilds with a readiness delta in Play state. + // During PhysicsInit, use a very low threshold (1) to ensure the scene "fills in" before gameplay starts. + const size_t deltaThreshold = (currentLoadState == InternalLoadingState::Play) ? 
20 : 1; + + if ((!asFrozen || IsLoading()) && (readyUniqueMeshCount >= lastASBuiltBLASCount + deltaThreshold) && !asBuildRequested.load(std::memory_order_relaxed)) { + std::cout << "AS rebuild requested: counts increased (built instances=" << lastASBuiltInstanceCount + << ", ready instances=" << readyRenderableCount + << ", built meshes=" << lastASBuiltBLASCount + << ", ready meshes=" << readyUniqueMeshCount + << ", threshold=" << deltaThreshold << ")\n"; + RequestAccelerationStructureBuild("counts increased"); + } + + // Post-load full scene repair + if (currentLoadState == InternalLoadingState::Play && !asBuildRequested.load(std::memory_order_relaxed)) { + const size_t targetInstances = readyRenderableCount; + if (targetInstances > 0 && lastASBuiltInstanceCount < static_cast(static_cast(targetInstances) * 0.95)) { + asDevOverrideAllowRebuild = true; + std::cout << "AS rebuild requested: post-load full build repair\n"; + RequestAccelerationStructureBuild("post-load full build"); + } + } + } + } + + // If in Ray Query static-only mode and TLAS not yet built post-load, request a one-time build now. + // (Does not require a readiness scan.) 
+ if (rayQueryEnabled&& accelerationStructureEnabled && currentRenderMode + == + RenderMode::RayQuery&& IsRayQueryStaticOnly() && + !IsLoading() && + !*tlasStructure.handle && !asBuildRequested.load(std::memory_order_relaxed) + ) { + RequestAccelerationStructureBuild("static-only initial build"); + } + + // Check if acceleration structure build was requested (e.g., after scene loading or counts grew) + // Build at this safe frame point to avoid threading issues + // Defer building for a few frames after loading to allow initial descriptor/UBO updates to settle + bool requested = asBuildRequested.load(std::memory_order_acquire) && (!loadDone || postLoadFrameCount > 5); + watchdogProgressLabel.store("Render: AS build request check", std::memory_order_relaxed); + if (renderCallCount % 100 == 1 && requested) { + } + if (requested) { + static bool firstLog = true; + if (firstLog) { + firstLog = false; + } + watchdogProgressLabel.store("Render: AS build request handling", std::memory_order_relaxed); + + // Defer TLAS/BLAS build while the scene loader is still in Parsing/Preallocating state + // to avoid partial builds. We allow builds to proceed once in PhysicsInit so the + // initial TLAS can be built before moving to Play. + const auto currentLoadState = currentInternalLoadingState.load(std::memory_order_relaxed); + if (currentLoadState == InternalLoadingState::Parsing || + currentLoadState == InternalLoadingState::Preallocating) { + // Defer + if (renderCallCount % 100 == 1) { + } + } else if (asFrozen && !asDevOverrideAllowRebuild && !IsLoading()) { + // Ignore + std::cout << "AS rebuild request ignored (frozen). 
Reason: " << lastASBuildRequestReason << "\n"; + asBuildRequested.store(false, std::memory_order_release); + asBuildRequestStartNs.store(0, std::memory_order_relaxed); + watchdogSuppressed.store(false, std::memory_order_relaxed); + } else { + // Gate initial build until readiness is high enough to represent the full scene + size_t totalRenderableEntities = 0; + size_t readyRenderableCount = 0; + size_t readyUniqueMeshCount = 0; + + // OPTIMIZATION: Only do the full O(N) scan every 30 frames or if explicitly requested post-load + static uint64_t lastScanFrame = 0; + static size_t cachedTotal = 0; + static size_t cachedReady = 0; + static size_t cachedMeshes = 0; + bool forceScan = (lastScanFrame == 0); + bool isInitialPostLoad = (!lastASBuildRequestReason.empty() && + (lastASBuildRequestReason.find("Scene loading complete") != std::string::npos || + lastASBuildRequestReason.find("Initial geometry preallocation complete") != std::string::npos)); + + if (forceScan || (totalFrameCount % 30 == 0) || isInitialPostLoad) { + size_t missingMeshResources = 0; + size_t pendingUploadsCount = 0; + size_t nullBuffersCount = 0; + size_t zeroIndicesCount = 0; { + auto lastKick = std::chrono::steady_clock::now(); + auto kickWatchdog = [&]() { + auto now = std::chrono::steady_clock::now(); + if (now - lastKick > std::chrono::milliseconds(200)) { + lastFrameUpdateTime.store(now, std::memory_order_relaxed); + lastKick = now; + } + }; + std::map meshToBLASProbe; + for (Entity* e : entities) { + kickWatchdog(); + if (!e || !e->IsActive()) + continue; + // In Ray Query static-only mode, ignore dynamic/animated entities for totals/readiness + if (IsRayQueryStaticOnly()) { + const std::string& nm = e->GetName(); + if (nm.find("_AnimNode_") != std::string::npos) + continue; + if (!nm.empty() && nm.rfind("Ball_", 0) == 0) + continue; + } + auto meshComp = e->GetComponent(); + if (!meshComp) + continue; + totalRenderableEntities++; + try { + auto it = meshResources.find(meshComp); + if (it == 
meshResources.end()) { + missingMeshResources++; + continue; + } + const auto& res = it->second; + // STRICT readiness here too: uploads finished + if (res.vertexBufferSizeBytes != 0 || res.indexBufferSizeBytes != 0) { + pendingUploadsCount++; + continue; + } + if (!*res.vertexBuffer || !*res.indexBuffer) { + nullBuffersCount++; + continue; + } + if (res.indexCount == 0) { + zeroIndicesCount++; + continue; + } + } catch (...) { + continue; + } + readyRenderableCount++; + if (meshToBLASProbe.find(meshComp) == meshToBLASProbe.end()) { + meshToBLASProbe[meshComp] = static_cast(meshToBLASProbe.size()); + } + } + readyUniqueMeshCount = meshToBLASProbe.size(); + } + cachedTotal = totalRenderableEntities; + cachedReady = readyRenderableCount; + cachedMeshes = readyUniqueMeshCount; + lastScanFrame = totalFrameCount; + } else { + totalRenderableEntities = cachedTotal; + readyRenderableCount = cachedReady; + readyUniqueMeshCount = cachedMeshes; + } + + const double readiness = (totalRenderableEntities > 0) ? static_cast(readyRenderableCount) / static_cast(totalRenderableEntities) : 0.0; + double buildThreshold = 0.95; // prefer building when ~full scene is ready + // If the build was explicitly requested after scene loading, lower the bar to avoid deadlock + // on large scenes where uploads may still be finishing. + if (isInitialPostLoad) { + buildThreshold = 0.0; // Force build immediately after loading is done + asDevOverrideAllowRebuild = true; + } else if (!lastASBuildRequestReason.empty() && lastASBuildRequestReason.find("Scene loading complete") != std::string::npos) { + buildThreshold = 0.10; // build with whatever is ready; we will rebuild/refit as more arrives + } + + // Bounded deferral: avoid getting stuck forever waiting for perfect readiness. + // After a short timeout from the original request, build with the best available data. 
+ const uint64_t reqNs = asBuildRequestStartNs.load(std::memory_order_relaxed); + const uint64_t nowNs = std::chrono::steady_clock::now().time_since_epoch().count(); + const double maxDeferralSeconds = 5.0; // tighten to kick off first build faster on large scenes + const bool deferralTimedOut = (reqNs != 0) && (nowNs > reqNs) && + (static_cast(nowNs - reqNs) / 1'000'000'000.0) >= maxDeferralSeconds; + + // Rate limit AS rebuilds to avoid CPU/GPU starvation. + // Use both time-based cooldown and readiness-based thresholds. + auto currentTime = std::chrono::steady_clock::now(); + const double minRebuildInterval = IsLoading() ? 5.0 : 2.0; // conservative while loading + const bool intervalPassed = std::chrono::duration(currentTime - lastASBuildTime).count() >= minRebuildInterval; + + // Delta-based gate: only rebuild if a significant number of new meshes are ready + const size_t readyDeltaThreshold = IsLoading() ? 20 : 5; + const bool significantDelta = (readyUniqueMeshCount >= lastBuiltUniqueMeshCount + readyDeltaThreshold); + + // Never rebuild while heavy preallocation is active to avoid frame-time spikes. + // Full geometry preallocation must finish before we start building AS. + const auto currentLoadState = currentInternalLoadingState.load(std::memory_order_relaxed); + const bool preallocActive = (currentLoadState == InternalLoadingState::Parsing || + currentLoadState == InternalLoadingState::Preallocating || + pendingEntityPreallocQueued.load(std::memory_order_relaxed)); + + if (readiness < buildThreshold && !asDevOverrideAllowRebuild && !deferralTimedOut) { + // ... defer logic ... + } else if ((!intervalPassed && !significantDelta && !asDevOverrideAllowRebuild && !isInitialPostLoad) || preallocActive) { + // Skip build this frame to maintain frame rate or wait for preallocation to finish. 
+ } else { + if (deferralTimedOut && readiness < buildThreshold && !asDevOverrideAllowRebuild) { + std::cout << "AS build forced after " << maxDeferralSeconds + << "s deferral (readiness " << readyRenderableCount << "/" << totalRenderableEntities + << ", uniqueMeshesReady=" << readyUniqueMeshCount << ")\n"; + } + struct WatchdogSuppressGuard { + std::atomic& flag; + explicit WatchdogSuppressGuard(std::atomic& f) : flag(f) { + flag.store(true, std::memory_order_relaxed); + } + ~WatchdogSuppressGuard() { + flag.store(false, std::memory_order_relaxed); + } + } watchdogGuard(watchdogSuppressed); + + // Ensure previous GPU work is complete BEFORE building AS. + // + // Wait for all *other* frame-in-flight fences to signal using a finite timeout loop + // and kick the watchdog while we wait. + // We already wait for the frameTimeline at the start of Render(), + // which ensures the GPU has finished the previous frame's work. + // Redundant inFlightFences wait removed to avoid deadlock with timeline-only sync. + { + // No-op + } + + watchdogProgressLabel.store("Render: buildAccelerationStructures", std::memory_order_relaxed); + if (IsLoading()) { + SetLoadingPhase(LoadingPhase::AccelerationStructures); + } + if (buildAccelerationStructures(entities)) { + watchdogProgressLabel.store("Render: after buildAccelerationStructures", std::memory_order_relaxed); + asBuildRequested.store(false, std::memory_order_release); + asBuildRequestStartNs.store(0, std::memory_order_relaxed); + // AS build request resolved; restore normal watchdog sensitivity. + watchdogSuppressed.store(false, std::memory_order_relaxed); + // Transition the loading UI to a finalizing phase (descriptor cold-init, etc.). + if (IsLoading()) { + SetLoadingPhase(LoadingPhase::Finalizing); + SetLoadingPhaseProgress(0.0f); + } + + // The TLAS handle can transition from null -> valid (or change on rebuild). 
+ // Ensure raster PBR descriptor sets (set 0, binding 11 `tlas`) are rewritten after an AS build + // so subsequent Raster draws never see an unwritten/stale acceleration-structure descriptor. + for (auto& kv : entityResources) { + kv.second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + for (Entity* e : entities) { + MarkEntityDescriptorsDirty(e); + } + + // Freeze only when the built AS covers EVERY renderable entity. + // This ensures that subsequent streaming (if any) or late-arriving meshes can still trigger a rebuild + // until the scene is truly 100% complete. + if (asFreezeAfterFullBuild) { + if (totalRenderableEntities > 0 && lastASBuiltInstanceCount >= totalRenderableEntities) { + asFrozen = true; + } + } + + // One concise TLAS summary with consistent units. + if (!!*tlasStructure.handle) { + if (IsRayQueryStaticOnly()) { + std::cout << "TLAS ready (static-only): tlasInstances=" << lastASBuiltTlasInstanceCount + << ", entities=" << lastASBuiltInstanceCount + << ", BLAS=" << lastASBuiltBLASCount + << ", addr=0x" << std::hex << tlasStructure.deviceAddress << std::dec << std::endl; + } else { + std::cout << "TLAS ready: tlasInstances=" << lastASBuiltTlasInstanceCount + << ", entities=" << lastASBuiltInstanceCount + << ", BLAS=" << lastASBuiltBLASCount + << ", addr=0x" << std::hex << tlasStructure.deviceAddress << std::dec << std::endl; + } + } + } else { + if (!accelerationStructureEnabled || !rayQueryEnabled) { + // Permanent failure due to lack of support; do not retry. + asBuildRequested.store(false, std::memory_order_release); + asBuildRequestStartNs.store(0, std::memory_order_relaxed); + watchdogSuppressed.store(false, std::memory_order_relaxed); + } else { + // If nothing is ready yet (e.g., mesh uploads still pending), don't spam logs. 
+ if (readyRenderableCount > 0 || readyUniqueMeshCount > 0) { + std::cout << "Failed to build acceleration structures, will retry next frame" << std::endl; + } + } + } + // Reset dev override after one use + asDevOverrideAllowRebuild = false; + } + } + } + + // Safe point: the previous work referencing this frame's descriptor sets is complete. + // Apply any deferred descriptor set updates for entities whose textures finished streaming. + watchdogProgressLabel.store("Render: ProcessDirtyDescriptorsForFrame", std::memory_order_relaxed); + ProcessDirtyDescriptorsForFrame(currentFrame); + watchdogProgressLabel.store("Render: after ProcessDirtyDescriptorsForFrame", std::memory_order_relaxed); + + // --- 1. PREPARATION PASS --- + // Gather active entities with mesh resources, perform per-frame descriptor initialization, + // and execute culling. This single pass replaces multiple redundant scans and reduces map lookups. + std::vector opaqueJobs; + std::vector transparentJobs; + opaqueJobs.reserve(entities.size()); + + // Optimization: skip scene rendering while initial scene loading is active or no camera exists. + // The loading overlay (rendered via ImGui at the end) is sufficient. 
+ if (camera && loadDone) { + watchdogProgressLabel.store("Render: preparation pass", std::memory_order_relaxed); + + // Prepare frustum once per frame for culling + FrustumPlanes frustum{}; + const bool doCulling = enableFrustumCulling && camera; + if (doCulling && camera) { + glm::mat4 proj = camera->GetProjectionMatrix(); + proj[1][1] *= -1.0f; + const glm::mat4 vp = proj * camera->GetViewMatrix(); + frustum = extractFrustumPlanes(vp); + } + lastCullingVisibleCount = 0; + lastCullingCulledCount = 0; + + uint32_t entityProcessCount = 0; + std::vector activeEntities; + activeEntities.reserve(entities.size()); + for (Entity* entity : entities) { + if (entity && entity->IsActive()) activeEntities.push_back(entity); + } + + uint32_t coldInitBurst = 0; + uint32_t processedInPass = 0; + // STAGGERED ACTIVATION: Only process a subset of entities for the first few frames + // to avoid a massive CPU spike on the first game frame in Debug mode. + uint64_t maxToProcess = entities.size(); + if (postLoadFrameCount < 100) { + maxToProcess = std::min((uint64_t)entities.size(), 100 * postLoadFrameCount + 500); + } + + for (Entity* entity : activeEntities) { + if (++processedInPass > maxToProcess) break; + + // Kick watchdog periodically during heavy preparation pass + if (processedInPass % 100 == 0) { + KickWatchdog(); + } + + auto meshComponent = entity->GetComponent(); + if (!meshComponent) + continue; + + EntityResources* pEntityRes = nullptr; + MeshResources* pMeshRes = nullptr; + { + std::shared_lock entityLock(entityResourcesMutex); + auto entityIt = entityResources.find(entity); + if (entityIt != entityResources.end()) pEntityRes = &entityIt->second; + } + { + std::shared_lock meshLock(meshResourcesMutex); + auto meshIt = meshResources.find(meshComponent); + if (meshIt != meshResources.end()) pMeshRes = &meshIt->second; + } + + if (!pEntityRes || !pMeshRes) + continue; + + EntityResources& entityRes = *pEntityRes; + MeshResources& meshRes = *pMeshRes; + + // Ensure 
material cache is valid once per frame + ensureEntityMaterialCache(entity, entityRes); + + // --- Per-frame Descriptor Cold-Init (Integrated) --- + // OPTIMIZATION: Stagger initial creation/updates for huge scenes to avoid main-thread hangs + // During post-load initialization, increase the burst size so pink fallback clears faster. + const uint32_t maxColdInitPerFrame = (isPostLoad && postLoadFrameCount < 200) ? 1000 : 50; + if (entityRes.basicDescriptorSets.empty() || entityRes.pbrDescriptorSets.empty()) { + if (++coldInitBurst > maxColdInitPerFrame) continue; + std::string texPath = meshComponent->GetBaseColorTexturePath(); + if (texPath.empty()) texPath = meshComponent->GetTexturePath(); + if (entityRes.basicDescriptorSets.empty()) createDescriptorSets(entity, entityRes, texPath, false); + if (entityRes.pbrDescriptorSets.empty()) createDescriptorSets(entity, entityRes, texPath, true); + } + + // Initialize binding 0 (UBO) for the current frame slot if not already done. + if (!entityRes.pbrUboBindingWritten[currentFrame] || !entityRes.basicUboBindingWritten[currentFrame]) { + if (++coldInitBurst > maxColdInitPerFrame) continue; + std::string texPath = meshComponent->GetBaseColorTexturePath(); + if (texPath.empty()) texPath = meshComponent->GetTexturePath(); + if (!entityRes.pbrUboBindingWritten[currentFrame]) { + updateDescriptorSetsForFrame(entity, entityRes, texPath, true, currentFrame, false, true); + } + if (!entityRes.basicUboBindingWritten[currentFrame]) { + updateDescriptorSetsForFrame(entity, entityRes, texPath, false, currentFrame, false, true); + } + } + + // Initialize images for the current frame slot if not already done. 
+ if (!entityRes.pbrImagesWritten[currentFrame] || !entityRes.basicImagesWritten[currentFrame]) { + if (++coldInitBurst > maxColdInitPerFrame) continue; + std::string texPath = meshComponent->GetBaseColorTexturePath(); + if (texPath.empty()) texPath = meshComponent->GetTexturePath(); + if (!entityRes.pbrImagesWritten[currentFrame]) { + updateDescriptorSetsForFrame(entity, entityRes, texPath, true, currentFrame, true, false); + entityRes.pbrImagesWritten[currentFrame] = true; + } + if (!entityRes.basicImagesWritten[currentFrame]) { + updateDescriptorSetsForFrame(entity, entityRes, texPath, false, currentFrame, true, false); + entityRes.basicImagesWritten[currentFrame] = true; + } + } + + // --- Culling & Classification --- + auto* tc = entity->GetComponent(); + bool useBlended = entityRes.cachedIsBlended; + + if (meshComponent->HasLocalAABB()) { + const glm::mat4 model = tc ? tc->GetModelMatrix() : glm::mat4(1.0f); + glm::vec3 wmin, wmax; + transformAABB(model, meshComponent->GetLocalAABBMin(), meshComponent->GetLocalAABBMax(), wmin, wmax); + + // 1. Frustum Culling + if (doCulling && !aabbIntersectsFrustum(wmin, wmax, frustum)) { + lastCullingCulledCount++; + continue; + } + + // 2. 
Distance-based LOD + if (enableDistanceLOD && camera) { + glm::vec3 camPos = camera->GetPosition(); + bool cameraInside = (camPos.x >= wmin.x && camPos.x <= wmax.x && + camPos.y >= wmin.y && camPos.y <= wmax.y && + camPos.z >= wmin.z && camPos.z <= wmax.z); + if (!cameraInside) { + float dx = std::max({0.0f, wmin.x - camPos.x, camPos.x - wmax.x}); + float dy = std::max({0.0f, wmin.y - camPos.y, camPos.y - wmax.y}); + float dz = std::max({0.0f, wmin.z - camPos.z, camPos.z - wmax.z}); + float dist = std::sqrt(dx * dx + dy * dy + dz * dz); + float z_eff = std::max(0.1f, dist); + float fov = glm::radians(camera->GetFieldOfView()); + float radius = glm::length(0.5f * (wmax - wmin)); + float pixelDiameter = (radius * 2.0f * static_cast(swapChainExtent.height)) / (z_eff * 2.0f * std::tan(fov * 0.5f)); + float threshold = useBlended ? lodPixelThresholdTransparent : lodPixelThresholdOpaque; + if (pixelDiameter < threshold) { + lastCullingCulledCount++; + continue; + } + } + } + } + + lastCullingVisibleCount++; + bool isAlphaMasked = false; + if (entityRes.materialCacheValid) { + isAlphaMasked = (entityRes.cachedMaterialProps.alphaMask > 0.5f); + } + + // Update UBO for visible entity once per frame (shared across all main passes) + updateUniformBuffer(currentFrame, entity, &entityRes, camera, tc); + + RenderJob job{entity, &entityRes, &meshRes, meshComponent, tc, isAlphaMasked}; + if (useBlended) { + transparentJobs.push_back(job); + } else { + opaqueJobs.push_back(job); + } + + // Update watchdog periodically + if (++entityProcessCount % 100 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + } + watchdogProgressLabel.store("Render: after preparation pass", std::memory_order_relaxed); + } + + // Safe point: flush any descriptor updates that were deferred while a command buffer + // was recording in a prior frame. Only apply ops for the current frame to avoid + // update-after-bind on pending frames. 
+ if (descriptorRefreshPending.load(std::memory_order_relaxed)) { + watchdogProgressLabel.store("Render: flush deferred descriptor ops", std::memory_order_relaxed); + std::vector ops; { + std::lock_guard lk(pendingDescMutex); + ops.swap(pendingDescOps); + descriptorRefreshPending.store(false, std::memory_order_relaxed); + } + uint32_t opCount = 0; + for (auto& op : ops) { + // Kick watchdog periodically during potentially heavy descriptor update bursts + if ((++opCount % 50u) == 0u) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + + if (op.frameIndex == currentFrame) { + // Now not recording; safe to apply updates for this frame + updateDescriptorSetsForFrame(op.entity, op.texPath, op.usePBR, op.frameIndex, op.imagesOnly); + } else { + // Keep other frame ops queued for next frame’s safe point + std::lock_guard lk(pendingDescMutex); + pendingDescOps.push_back(op); + descriptorRefreshPending.store(true, std::memory_order_relaxed); + } + } + watchdogProgressLabel.store("Render: after deferred descriptor ops", std::memory_order_relaxed); + } + + // Safe point: handle any pending reflection resource (re)creation and per-frame descriptor refreshes + if (reflectionResourcesDirty) { + if (enablePlanarReflections) { + uint32_t rw = std::max(1u, static_cast(static_cast(swapChainExtent.width) * reflectionResolutionScale)); + uint32_t rh = std::max(1u, static_cast(static_cast(swapChainExtent.height) * reflectionResolutionScale)); + createReflectionResources(rw, rh); + } else { + destroyReflectionResources(); + } + reflectionResourcesDirty = false; + } + + // Reflection descriptor binding refresh is handled elsewhere; avoid redundant per-frame mass updates here. + // Pick the VP associated with the previous frame's reflection texture for sampling in the main pass + if (enablePlanarReflections && !reflectionVPs.empty()) { + uint32_t prev = (currentFrame > 0) ? 
(currentFrame - 1) : (static_cast(reflectionVPs.size()) - 1); + sampleReflectionVP = reflectionVPs[prev]; + } + + // This function updates bindings 6/7/8 (storage buffers) which don't have UPDATE_AFTER_BIND. + // Updating these every frame causes "updated without UPDATE_AFTER_BIND" errors with MAX_FRAMES_IN_FLIGHT > 1. + // These bindings are already initialized in createDescriptorSets and updated when buffers change. + // Binding 10 (reflection map) has UPDATE_AFTER_BIND and can be updated separately if needed. + // refreshPBRForwardPlusBindingsForFrame(currentFrame); + + // Acquire next swapchain image + // acquireNextImage returns imageIndex (which swapchain image is available). + // Use currentFrame to select an imageAvailableSemaphore for acquire. + // Use imageIndex to select renderFinishedSemaphore for present (ties semaphore to the specific image). + const uint32_t acquireSemaphoreIndex = currentFrame % static_cast(imageAvailableSemaphores.size()); + + uint32_t imageIndex; + vk::Result acquireResultCode = vk::Result::eSuccess; + // Helper overloads to normalize acquireNextImage return across Vulkan-Hpp versions + auto extractAcquire = [](auto const& ret, vk::Result& code, uint32_t& idx) { + using RetT = std::decay_t; + if constexpr (std::is_same_v>) { + code = ret.result; + idx = ret.value; + } else { + // Assume older std::pair + code = ret.first; + idx = ret.second; + } + }; + try { + watchdogProgressLabel.store("Render: acquireNextImage", std::memory_order_relaxed); + // Use a 100ms timeout to avoid infinite hangs in headless/CI environments. + // If acquire fails after 100ms, we skip the frame and return. + auto acquireRet = swapChain.acquireNextImage(100'000'000, *imageAvailableSemaphores[acquireSemaphoreIndex]); + // Vulkan-Hpp changed the return type of acquireNextImage for RAII swapchain across versions. + // Support both vk::ResultValue (newer) and std::pair (older). 
+ extractAcquire(acquireRet, acquireResultCode, imageIndex); + } catch (const vk::OutOfDateKHRError&) { + watchdogProgressLabel.store("Render: acquireNextImage out-of-date", std::memory_order_relaxed); + // Swapchain is out of date (e.g., window resized) before we could + // query the result. Trigger recreation and exit this frame cleanly. + framebufferResized.store(true, std::memory_order_relaxed); + if (imguiSystem) + ImGui::EndFrame(); + // IMPORTANT: We already reset the in-flight fence at the start of the frame. + // Because we're exiting early (no submit), signal it via an empty submit so + // swapchain recreation won't hang waiting for an unsignaled fence. + { + vk::SubmitInfo2 emptySubmit2{}; + Submit2(*graphicsQueue, emptySubmit2, *inFlightFences[currentFrame]); + } + recreateSwapChain(); + return; + } + + // imageIndex already populated above + watchdogProgressLabel.store("Render: acquired swapchain image", std::memory_order_relaxed); + + if (acquireResultCode == vk::Result::eTimeout) { + // Expected in headless/CI environments where the window may not be visible. + // Return early without error so the engine loop can continue. + if (imguiSystem) + ImGui::EndFrame(); + + // Signal the timeline even on timeout to avoid deadlocking subsequent frames + // that wait for this frame's completion. 
+ { + vk::SubmitInfo2 emptySubmit2{}; + vk::SemaphoreSubmitInfo signalInfo{ + .semaphore = *frameTimeline, + .value = currentTimelineValue + TimelineMilestones::eGpuWorkFinished, + .stageMask = vk::PipelineStageFlagBits2::eAllCommands + }; + emptySubmit2.signalSemaphoreInfoCount = 1; + emptySubmit2.pSignalSemaphoreInfos = &signalInfo; + Submit2(*graphicsQueue, emptySubmit2, nullptr); + } + return; + } + if (acquireResultCode == vk::Result::eSuboptimalKHR || framebufferResized.load(std::memory_order_relaxed)) { + framebufferResized.store(false, std::memory_order_relaxed); + if (imguiSystem) + ImGui::EndFrame(); + // Fence was reset earlier; ensure it is signaled before we bail out + // to avoid a deadlock in swapchain recreation. + { + vk::SubmitInfo2 emptySubmit2{}; + Submit2(*graphicsQueue, emptySubmit2, *inFlightFences[currentFrame]); + } + recreateSwapChain(); + return; + } + if (acquireResultCode != vk::Result::eSuccess) { + throw std::runtime_error("Failed to acquire swap chain image"); + } + + if (framebufferResized.load(std::memory_order_relaxed)) { + // Signal the fence via empty submit since no real work will be submitted + // this frame, preventing a wait on an unsignaled fence during resize. + { + vk::SubmitInfo2 emptySubmit2{}; + Submit2(*graphicsQueue, emptySubmit2, *inFlightFences[currentFrame]); + } + recreateSwapChain(); + return; + } + + // Perform any descriptor updates that must not happen during command buffer recording + if (useForwardPlus) { + uint32_t tilesX_pre = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX; + uint32_t tilesY_pre = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY; + // Only update current frame's descriptors to avoid touching in-flight frames + createOrResizeForwardPlusBuffers(tilesX_pre, tilesY_pre, forwardPlusSlicesZ, /*updateOnlyCurrentFrame=*/true); + // After (re)creating Forward+ buffers, bindings 7/8 will be refreshed as needed. 
+ } + + // Ensure light buffers are sufficiently large before recording to avoid resizing while in use + { + // Reserve capacity based on emissive lights only (punctual lights disabled for now) + size_t desiredLightCapacity = 0; + if (!staticLights.empty()) { + size_t emissiveCount = 0; + for (const auto& L : staticLights) { + if (L.type == ExtractedLight::Type::Emissive) { + ++emissiveCount; + if (emissiveCount >= MAX_ACTIVE_LIGHTS) + break; + } + } + desiredLightCapacity = emissiveCount; + } + if (desiredLightCapacity > 0) { + createOrResizeLightStorageBuffers(desiredLightCapacity); + // Ensure compute (binding 0) sees the current frame's lights buffer + refreshForwardPlusComputeLightsBindingForFrame(currentFrame); + // Bindings 6/7/8 for PBR are refreshed only when buffers change (handled in resize path). + } + } + + // Safe point: Update ray query descriptor sets if ray query mode is active + // This MUST happen before command buffer recording starts to avoid "descriptor updated without UPDATE_AFTER_BIND" errors + if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) { + if (!!*tlasStructure.handle) { + watchdogProgressLabel.store("Render: updateRayQueryDescriptorSets", std::memory_order_relaxed); + updateRayQueryDescriptorSets(currentFrame, entities); + watchdogProgressLabel.store("Render: after updateRayQueryDescriptorSets", std::memory_order_relaxed); + } + } + + // Refit TLAS if needed (either for Ray Query mode or for Raster shadows) + // Skip during initial 20 post-load frames to ensure smooth verification. 
+ const bool needTLAS = (currentRenderMode == RenderMode::RayQuery || enableRasterRayQueryShadows) && accelerationStructureEnabled; + if (needTLAS && !!*tlasStructure.handle && postLoadFrameCount > 20) { + if (!IsRayQueryStaticOnly()) { + watchdogProgressLabel.store("Render: refitTopLevelAS", std::memory_order_relaxed); + refitTopLevelAS(entities, camera); + } + } + + commandBuffers[currentFrame].reset(); + // Begin command buffer recording for this frame + commandBuffers[currentFrame].begin(vk::CommandBufferBeginInfo()); + isRecordingCmd.store(true, std::memory_order_relaxed); + + if (framebufferResized.load(std::memory_order_relaxed)) { + commandBuffers[currentFrame].end(); + recreateSwapChain(); + return; + } + + // Ray query rendering mode dispatch + if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) { + // Check if TLAS handle is valid (dereference RAII handle) + if (!*tlasStructure.handle) { + // TLAS not built yet. + // During loading, allow the raster path (and the progress overlay) to render normally + // instead of presenting a diagnostic magenta frame. + if (!IsLoading()) { + // If we are in Ray Query mode but AS is not built yet, don't just show magenta. + // Fall back to Rasterization so the user sees something while the background build proceeds. + static auto lastPinkLog = std::chrono::steady_clock::now(); + auto now = std::chrono::steady_clock::now(); + if (std::chrono::duration_cast(now - lastPinkLog).count() >= 5) { + std::cout << "Ray Query active but TLAS not ready; falling back to Rasterization for this frame." 
<< std::endl; + lastPinkLog = now; + } + rayQueryRenderedThisFrame = false; // Proceed to raster path + } + } else { + // TLAS is valid and descriptor sets were already updated at safe point + // Proceed with ray query rendering + // Bind ray query compute pipeline + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eCompute, *rayQueryPipeline); + + // Bind descriptor set + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eCompute, + *rayQueryPipelineLayout, + 0, + *rayQueryDescriptorSets[currentFrame], + nullptr); + + // This dedicated UBO is separate from entity UBOs and uses a Ray Query-specific layout. + if (rayQueryUniformBuffersMapped.size() > currentFrame && rayQueryUniformBuffersMapped[currentFrame]) { + RayQueryUniformBufferObject ubo{}; + ubo.model = glm::mat4(1.0f); // Identity - not used for ray query + + // Force view matrix update to reflect current camera position + // (the dirty flag isn't automatically set when camera position changes) + camera->ForceViewMatrixUpdate(); + + // Get camera matrices + glm::mat4 camView = camera->GetViewMatrix(); + ubo.view = camView; + ubo.proj = camera->GetProjectionMatrix(); + ubo.proj[1][1] *= -1; // Flip Y for Vulkan + ubo.camPos = glm::vec4(camera->GetPosition(), 1.0f); + // Clamp to sane ranges to avoid black output (exposure=0 → 1-exp(0)=0) + ubo.exposure = std::clamp(exposure, 0.2f, 4.0f); + ubo.gamma = std::clamp(gamma, 1.6f, 2.6f); + // Match raster convention: ambient scale factor for simple IBL/ambient term. + // (Raster defaults to ~1.0 in the main pass; keep Ray Query consistent.) + ubo.scaleIBLAmbient = 1.0f; + // Provide the per-frame light count so the ray query shader can iterate lights. + ubo.lightCount = static_cast(lastFrameLightCount); + ubo.screenDimensions = glm::vec2(swapChainExtent.width, swapChainExtent.height); + ubo.enableRayQueryReflections = enableRayQueryReflections ? 1 : 0; + ubo.enableRayQueryTransparency = enableRayQueryTransparency ? 
1 : 0; + // Max secondary bounces (reflection/refraction). Stored in the padding slot to avoid UBO layout churn. + // Shader clamps this value. + ubo._pad0 = rayQueryMaxBounces; + // Thick-glass toggles and tuning + ubo.enableThickGlass = enableThickGlass ? 1 : 0; + ubo.thicknessClamp = thickGlassThicknessClamp; + ubo.absorptionScale = thickGlassAbsorptionScale; + // Ray Query hard shadows (see `shaders/ray_query.slang`) + ubo._pad1 = enableRayQueryShadows ? 1 : 0; + ubo.shadowSampleCount = std::clamp(rayQueryShadowSampleCount, 1, 32); + ubo.shadowSoftness = std::clamp(rayQueryShadowSoftness, 0.0f, 1.0f); + ubo.reflectionIntensity = reflectionIntensity; + // Provide geometry info count for shader-side bounds checking (per-instance) + ubo.geometryInfoCount = static_cast(tlasInstanceCount); + // Provide material buffer count for shader-side bounds checking + ubo.materialCount = static_cast(materialCountCPU); + + // Copy to mapped memory + std::memcpy(rayQueryUniformBuffersMapped[currentFrame], &ubo, sizeof(RayQueryUniformBufferObject)); + } else { + // Keep concise error for visibility + std::cerr << "Ray Query UBO not mapped for frame " << currentFrame << "\n"; + } + + // Dispatch compute shader (8x8 workgroups as defined in shader) + uint32_t workgroupsX = (swapChainExtent.width + 7) / 8; + uint32_t workgroupsY = (swapChainExtent.height + 7) / 8; + commandBuffers[currentFrame].dispatch(workgroupsX, workgroupsY, 1); + + // Barrier: wait for compute shader to finish writing to output image, + // then make it readable by fragment shader for sampling in composite pass + vk::ImageMemoryBarrier2 rqToSample{}; + rqToSample.srcStageMask = vk::PipelineStageFlagBits2::eComputeShader; + rqToSample.srcAccessMask = vk::AccessFlagBits2::eShaderWrite; + rqToSample.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader; + rqToSample.dstAccessMask = vk::AccessFlagBits2::eShaderRead; + rqToSample.oldLayout = vk::ImageLayout::eGeneral; + rqToSample.newLayout = 
vk::ImageLayout::eShaderReadOnlyOptimal; + rqToSample.image = *rayQueryOutputImage; + rqToSample.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + rqToSample.subresourceRange.levelCount = 1; + rqToSample.subresourceRange.layerCount = 1; + + vk::DependencyInfo depRQToSample{}; + depRQToSample.imageMemoryBarrierCount = 1; + depRQToSample.pImageMemoryBarriers = &rqToSample; + commandBuffers[currentFrame].pipelineBarrier2(depRQToSample); + + // Composite fullscreen: sample rayQueryOutputImage to the swapchain using the composite pipeline + // Transition swapchain image to COLOR_ATTACHMENT_OPTIMAL + vk::ImageMemoryBarrier2 swapchainToColor{}; + swapchainToColor.srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe; + swapchainToColor.srcAccessMask = vk::AccessFlagBits2::eNone; + swapchainToColor.dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput; + swapchainToColor.dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead; + swapchainToColor.oldLayout = (imageIndex < swapChainImageLayouts.size()) ? 
swapChainImageLayouts[imageIndex] : vk::ImageLayout::eUndefined; + swapchainToColor.newLayout = vk::ImageLayout::eColorAttachmentOptimal; + swapchainToColor.image = swapChainImages[imageIndex]; + swapchainToColor.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + swapchainToColor.subresourceRange.levelCount = 1; + swapchainToColor.subresourceRange.layerCount = 1; + vk::DependencyInfo depSwapToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &swapchainToColor}; + commandBuffers[currentFrame].pipelineBarrier2(depSwapToColor); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = swapchainToColor.newLayout; + + // Begin dynamic rendering for composite (no depth) + colorAttachments[0].imageView = *swapChainImageViews[imageIndex]; + colorAttachments[0].loadOp = vk::AttachmentLoadOp::eClear; + depthAttachment.loadOp = vk::AttachmentLoadOp::eDontCare; + renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent); + auto savedDepthPtr2 = renderingInfo.pDepthAttachment; + renderingInfo.pDepthAttachment = nullptr; + commandBuffers[currentFrame].beginRendering(renderingInfo); + + if (!!*compositePipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *compositePipeline); + } + vk::Viewport vp(0.0f, 0.0f, static_cast(swapChainExtent.width), static_cast(swapChainExtent.height), 0.0f, 1.0f); + vk::Rect2D sc({0, 0}, swapChainExtent); + commandBuffers[currentFrame].setViewport(0, vp); + commandBuffers[currentFrame].setScissor(0, sc); + + // Bind the RQ composite descriptor set (samples rayQueryOutputImage) + if (!rqCompositeDescriptorSets.empty()) { + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + *compositePipelineLayout, + 0, + {*rqCompositeDescriptorSets[currentFrame]}, + {}); + } + + // Push exposure/gamma and sRGB flag + struct CompositePush { + float exposure; + float gamma; + int outputIsSRGB; + float _pad; + } pc2{}; + pc2.exposure = 
std::clamp(this->exposure, 0.2f, 4.0f); + pc2.gamma = this->gamma; + pc2.outputIsSRGB = (swapChainImageFormat == vk::Format::eR8G8B8A8Srgb || swapChainImageFormat == vk::Format::eB8G8R8A8Srgb) ? 1 : 0; + commandBuffers[currentFrame].pushConstants(*compositePipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, pc2); + + commandBuffers[currentFrame].draw(3, 1, 0, 0); + commandBuffers[currentFrame].endRendering(); + renderingInfo.pDepthAttachment = savedDepthPtr2; + + // Transition swapchain back to PRESENT and RQ image back to GENERAL for next frame + vk::ImageMemoryBarrier2 swapchainToPresent{}; + swapchainToPresent.srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput; + swapchainToPresent.srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite; + swapchainToPresent.dstStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe; + swapchainToPresent.dstAccessMask = vk::AccessFlagBits2::eNone; + swapchainToPresent.oldLayout = vk::ImageLayout::eColorAttachmentOptimal; + swapchainToPresent.newLayout = vk::ImageLayout::ePresentSrcKHR; + swapchainToPresent.image = swapChainImages[imageIndex]; + swapchainToPresent.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + swapchainToPresent.subresourceRange.levelCount = 1; + swapchainToPresent.subresourceRange.layerCount = 1; + + vk::ImageMemoryBarrier2 rqBackToGeneral{}; + rqBackToGeneral.srcStageMask = vk::PipelineStageFlagBits2::eFragmentShader; + rqBackToGeneral.srcAccessMask = vk::AccessFlagBits2::eShaderRead; + rqBackToGeneral.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader; + rqBackToGeneral.dstAccessMask = vk::AccessFlagBits2::eShaderWrite; + rqBackToGeneral.oldLayout = vk::ImageLayout::eShaderReadOnlyOptimal; + rqBackToGeneral.newLayout = vk::ImageLayout::eGeneral; + rqBackToGeneral.image = *rayQueryOutputImage; + rqBackToGeneral.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor; + rqBackToGeneral.subresourceRange.levelCount = 1; + rqBackToGeneral.subresourceRange.layerCount 
= 1; + + std::array barriers{swapchainToPresent, rqBackToGeneral}; + vk::DependencyInfo depEnd{.imageMemoryBarrierCount = static_cast(barriers.size()), .pImageMemoryBarriers = barriers.data()}; + commandBuffers[currentFrame].pipelineBarrier2(depEnd); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = swapchainToPresent.newLayout; + + // Ray query rendering complete - set flag to skip rasterization code path + rayQueryRenderedThisFrame = true; + } + } + + // Process texture streaming uploads (see Renderer::ProcessPendingTextureJobs) + + vk::raii::Pipeline* currentPipeline = nullptr; + vk::raii::PipelineLayout* currentLayout = nullptr; + + // Incrementally process pending texture uploads on the main thread so that + // all Vulkan submits happen from a single place while worker threads only + // handle CPU-side decoding. While the loading screen is up, prioritize + // critical textures so the first rendered frame looks mostly correct. + if (IsLoading()) { + // Larger budget while loading screen is visible so we don't stall + // streaming of near-field baseColor textures. + ProcessPendingTextureJobs(/*maxJobs=*/16, /*includeCritical=*/true, /*includeNonCritical=*/false); + } else { + // After loading screen disappears, we want the scene to remain + // responsive (~20 fps) while textures stream in. Limit the number + // of non-critical uploads per frame so we don't tank frame time. + static uint32_t streamingFrameCounter = 0; + streamingFrameCounter++; + // Ray Query needs textures visible quickly; process more streaming work when in Ray Query mode. + if (currentRenderMode == RenderMode::RayQuery) { + // Aggressively drain both critical and non-critical queues each frame for faster bring-up. + ProcessPendingTextureJobs(/*maxJobs=*/32, /*includeCritical=*/true, /*includeNonCritical=*/true); + } else { + // Raster path: keep previous throttling to avoid stalls. 
+ if ((streamingFrameCounter % 3) == 0) { + ProcessPendingTextureJobs(/*maxJobs=*/1, /*includeCritical=*/false, /*includeNonCritical=*/true); + } + } + } + + // Renderer UI - available for both ray query and rasterization modes. + // Hide UI during loading; the progress overlay is handled by ImGuiSystem::NewFrame(). + if (imguiSystem && !imguiSystem->IsFrameRendered() && !IsLoading()) { + if (ImGui::Begin("Renderer")) { + // Declare variables that need to persist across conditional blocks + bool prevFwdPlus = useForwardPlus; + + // === RENDERING MODE SELECTION (TOP) === + ImGui::Text("Rendering Mode:"); + if (rayQueryEnabled && accelerationStructureEnabled) { + const char* modeNames[] = {"Rasterization", "Ray Query"}; + int currentMode = (currentRenderMode == RenderMode::RayQuery) ? 1 : 0; + if (ImGui::Combo("Mode", ¤tMode, modeNames, 2)) { + RenderMode newMode = (currentMode == 1) ? RenderMode::RayQuery : RenderMode::Rasterization; + if (newMode != currentRenderMode) { + currentRenderMode = newMode; + std::cout << "Switched to " << modeNames[currentMode] << " mode\n"; + + // Request acceleration structure build when switching to ray query mode + if (currentRenderMode == RenderMode::RayQuery) { + std::cout << "Requesting acceleration structure build...\n"; + RequestAccelerationStructureBuild(); + } + + // Switching modes can change which pipelines are bound and whether ray-query-dependent + // descriptor bindings (e.g., PBR binding 11 `tlas`) become statically used. + // Mark entity descriptor sets dirty so the next safe point refreshes bindings for this frame. 
+ for (auto& kv : entityResources) { + kv.second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + for (Entity* e : entities) { + MarkEntityDescriptorsDirty(e); + } + } + } + } else { + ImGui::TextColored(ImVec4(0.7f, 0.7f, 0.7f, 1.0f), "Rasterization only (ray query not supported)"); + } + + // === RASTERIZATION-SPECIFIC OPTIONS === + if (currentRenderMode == RenderMode::Rasterization) { + ImGui::Separator(); + ImGui::Text("Rasterization Options:"); + + // Lighting Controls - BRDF/PBR is now the default lighting model + bool useBasicLighting = imguiSystem && !imguiSystem->IsPBREnabled(); + if (ImGui::Checkbox("Use Basic Lighting (Phong)", &useBasicLighting)) { + imguiSystem->SetPBREnabled(!useBasicLighting); + std::cout << "Lighting mode: " << (!useBasicLighting ? "BRDF/PBR (default)" : "Basic Phong") << std::endl; + } + + if (!useBasicLighting) { + ImGui::Text("Status: BRDF/PBR pipeline active (default)"); + ImGui::Text("All models rendered with physically-based lighting"); + } else { + ImGui::Text("Status: Basic Phong pipeline active"); + ImGui::Text("All models rendered with basic Phong shading"); + } + + ImGui::Checkbox("Forward+ (tiled light culling)", &useForwardPlus); + if (useForwardPlus && !prevFwdPlus) { + // Lazily create Forward+ resources if enabled at runtime + if (!*forwardPlusPipeline || !*forwardPlusDescriptorSetLayout || forwardPlusPerFrame.empty()) { + createForwardPlusPipelinesAndResources(); + } + if (!*depthPrepassPipeline) { + createDepthPrepassPipeline(); + } + } + + // Raster shadows via ray queries (experimental) + if (rayQueryEnabled && accelerationStructureEnabled) { + ImGui::Checkbox("RayQuery shadows (raster)", &enableRasterRayQueryShadows); + } else { + ImGui::TextDisabled("RayQuery shadows (raster) (requires ray query + AS)"); + } + + // Planar reflections controls + ImGui::Spacing(); + /* + if (ImGui::Checkbox("Planar reflections (experimental)", &enablePlanarReflections)) { + // Defer actual 
(re)creation/destruction to the next safe point at frame start + reflectionResourcesDirty = true; + } + */ + enablePlanarReflections = false; + float scaleBefore = reflectionResolutionScale; + if (ImGui::SliderFloat("Reflection resolution scale", &reflectionResolutionScale, 0.25f, 1.0f, "%.2f")) { + reflectionResolutionScale = std::clamp(reflectionResolutionScale, 0.25f, 1.0f); + if (enablePlanarReflections&& std::abs(scaleBefore - reflectionResolutionScale) + > + 1e-3f + ) { + reflectionResourcesDirty = true; + } + } + if (enablePlanarReflections && !reflections.empty()) { + auto& rt = reflections[currentFrame]; + if (rt.width > 0) { + ImGui::Text("Reflection RT: %ux%u", rt.width, rt.height); + } + } + } + + // === RAY QUERY-SPECIFIC OPTIONS === + if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) { + ImGui::Separator(); + ImGui::Text("Ray Query Status:"); + + // Show acceleration structure status + if (!!*tlasStructure.handle) { + ImGui::TextColored(ImVec4(0.0f, 1.0f, 0.0f, 1.0f), "Acceleration Structures: Built (%zu meshes)", blasStructures.size()); + } else { + ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Acceleration Structures: Not built"); + } + + ImGui::Spacing(); + ImGui::Text("Ray Query Features:"); + ImGui::Checkbox("Enable Hard Shadows", &enableRayQueryShadows); + if (enableRayQueryShadows) { + ImGui::SliderInt("Shadow samples", &rayQueryShadowSampleCount, 1, 32); + ImGui::SliderFloat("Shadow softness (fraction of range)", &rayQueryShadowSoftness, 0.0f, 0.2f, "%.3f"); + } + ImGui::Checkbox("Enable Reflections", &enableRayQueryReflections); + ImGui::Checkbox("Enable Transparency/Refraction", &enableRayQueryTransparency); + ImGui::SliderInt("Max secondary bounces", &rayQueryMaxBounces, 0, 10); + // Thick-glass realism controls + ImGui::Separator(); + ImGui::Text("Thick Glass"); + ImGui::Checkbox("Enable Thick Glass", &enableThickGlass); + ImGui::SliderFloat("Thickness Clamp (m)", 
&thickGlassThicknessClamp, 0.0f, 0.5f, "%.3f"); + ImGui::SliderFloat("Absorption Scale", &thickGlassAbsorptionScale, 0.0f, 4.0f, "%.2f"); + } + + // === SHARED OPTIONS (BOTH MODES) === + ImGui::Separator(); + ImGui::Text("Culling & LOD:"); + if (ImGui::Checkbox("Frustum culling", &enableFrustumCulling)) { + // no-op, takes effect immediately + } + if (ImGui::Checkbox("Distance LOD (projected-size skip)", &enableDistanceLOD)) { + } + ImGui::SliderFloat("LOD threshold opaque (px)", &lodPixelThresholdOpaque, 0.5f, 8.0f, "%.1f"); + ImGui::SliderFloat("LOD threshold transparent (px)", &lodPixelThresholdTransparent, 0.5f, 12.0f, "%.1f"); + // Anisotropy control (recreate samplers on change) + { + float deviceMaxAniso = physicalDevice.getProperties().limits.maxSamplerAnisotropy; + if (ImGui::SliderFloat("Sampler max anisotropy", &samplerMaxAnisotropy, 1.0f, deviceMaxAniso, "%.1f")) { + // Recreate samplers for all textures to apply new anisotropy + std::unique_lock texLock(textureResourcesMutex); + for (auto& kv : textureResources) { + createTextureSampler(kv.second); + } + // Default texture + createTextureSampler(defaultTextureResources); + } + } + if (lastCullingVisibleCount + lastCullingCulledCount > 0) { + ImGui::Text("Culling: visible=%u, culled=%u", lastCullingVisibleCount, lastCullingCulledCount); + } + + // Basic tone mapping controls + ImGui::Separator(); + ImGui::Text("Tone Mapping & Tuning:"); + ImGui::SliderFloat("Reflection intensity", &reflectionIntensity, 0.0f, 2.0f, "%.2f"); + ImGui::SliderFloat("Exposure", &exposure, 0.1f, 4.0f, "%.2f"); + ImGui::SliderFloat("Gamma", &gamma, 1.6f, 2.6f, "%.2f"); + } + ImGui::End(); + } + + // Rasterization rendering: only execute if ray query did not render this frame. 
+ if (!rayQueryRenderedThisFrame) { + // Optional: render planar reflections first + /* + if (enablePlanarReflections) { + glm::vec4 planeWS(0.0f, 1.0f, 0.0f, 0.0f); + renderReflectionPass(commandBuffers[currentFrame], planeWS, camera, opaqueJobs); + } + */ + + // Sort transparent entities back-to-front for correct blending of nested glass/liquids + if (!transparentJobs.empty()) { + glm::vec3 camPos = camera ? camera->GetPosition() : glm::vec3(0.0f); + std::ranges::sort(transparentJobs, + [camPos](const RenderJob& a, const RenderJob& b) { + glm::vec3 pa = a.transformComp ? a.transformComp->GetPosition() : glm::vec3(0.0f); + glm::vec3 pb = b.transformComp ? b.transformComp->GetPosition() : glm::vec3(0.0f); + float da2 = glm::length2(pa - camPos); + float db2 = glm::length2(pb - camPos); + if (da2 != db2) return da2 > db2; + if (a.entityRes->cachedIsLiquid != b.entityRes->cachedIsLiquid) return a.entityRes->cachedIsLiquid; + return a.entity < b.entity; + }); + } + + + // Track whether we executed a depth pre-pass this frame (used to choose depth load op and pipeline state) + bool didOpaqueDepthPrepass = false; + + // Optional Forward+ depth pre-pass for opaque geometry + if (useForwardPlus) { + if (!opaqueJobs.empty()) { + // Transition depth image for attachment write (Sync2) + vk::ImageMemoryBarrier2 depthBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = {}, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *depthImage, + .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfoDepth{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &depthBarrier2}; + 
commandBuffers[currentFrame].pipelineBarrier2(depInfoDepth); + + // Depth-only rendering + vk::RenderingAttachmentInfo depthOnlyAttachment{.imageView = *depthImageView, .imageLayout = vk::ImageLayout::eDepthAttachmentOptimal, .loadOp = vk::AttachmentLoadOp::eClear, .storeOp = vk::AttachmentStoreOp::eStore, .clearValue = vk::ClearDepthStencilValue{1.0f, 0}}; + vk::RenderingInfo depthOnlyInfo{.renderArea = vk::Rect2D({0, 0}, swapChainExtent), .layerCount = 1, .colorAttachmentCount = 0, .pColorAttachments = nullptr, .pDepthAttachment = &depthOnlyAttachment}; + commandBuffers[currentFrame].beginRendering(depthOnlyInfo); + vk::Viewport viewport(0.0f, 0.0f, static_cast(swapChainExtent.width), static_cast(swapChainExtent.height), 0.0f, 1.0f); + commandBuffers[currentFrame].setViewport(0, viewport); + vk::Rect2D scissor({0, 0}, swapChainExtent); + commandBuffers[currentFrame].setScissor(0, scissor); + + // Bind depth pre-pass pipeline + if (!!*depthPrepassPipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *depthPrepassPipeline); + } + + for (const auto& job : opaqueJobs) { + if (job.isAlphaMasked) continue; + + // Bind geometry + std::array buffers = {*job.meshRes->vertexBuffer, *job.entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets); + commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + // Bind descriptor set (PBR set 0) + commandBuffers[currentFrame].bindDescriptorSets(vk::PipelineBindPoint::eGraphics, + *pbrPipelineLayout, + 0, + *job.entityRes->pbrDescriptorSets[currentFrame], + nullptr); + + // Issue draw + uint32_t instanceCount = std::max(1u, static_cast(job.meshComp->GetInstanceCount())); + commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCount, 0, 0, 0); + } + + commandBuffers[currentFrame].endRendering(); + + // Barrier to ensure depth is visible for subsequent passes 
(Sync2) + vk::ImageMemoryBarrier2 depthToRead2{ + .srcStageMask = vk::PipelineStageFlagBits2::eLateFragmentTests, + .srcAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead, + .oldLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *depthImage, + .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfoDepthToRead{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &depthToRead2}; + commandBuffers[currentFrame].pipelineBarrier2(depInfoDepthToRead); + + didOpaqueDepthPrepass = true; + } + + // Forward+ compute culling based on current camera and screen tiles + uint32_t tilesX = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX; + uint32_t tilesY = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY; + + // Lights already extracted at frame start - use lastFrameLightCount for Forward+ params + glm::mat4 view = camera->GetViewMatrix(); + glm::mat4 proj = camera->GetProjectionMatrix(); + proj[1][1] *= -1.0f; + float nearZ = camera->GetNearPlane(); + float farZ = camera->GetFarPlane(); + updateForwardPlusParams(currentFrame, view, proj, lastFrameLightCount, tilesX, tilesY, forwardPlusSlicesZ, nearZ, farZ); + // As a last guard before dispatch, make sure compute binding 0 is valid for this frame + refreshForwardPlusComputeLightsBindingForFrame(currentFrame); + + dispatchForwardPlus(commandBuffers[currentFrame], tilesX, tilesY, forwardPlusSlicesZ); + } + + // PASS 1: RENDER OPAQUE OBJECTS TO OFF-SCREEN TEXTURE + // Transition off-screen color to attachment write (Sync2). 
On first use after creation or after switching + // from a mode that never produced this image, the layout may still be UNDEFINED. + vk::ImageLayout oscOldLayout = vk::ImageLayout::eUndefined; + vk::PipelineStageFlags2 oscSrcStage = vk::PipelineStageFlagBits2::eTopOfPipe; + vk::AccessFlags2 oscSrcAccess = vk::AccessFlagBits2::eNone; + if (currentFrame < opaqueSceneColorImageLayouts.size()) { + oscOldLayout = opaqueSceneColorImageLayouts[currentFrame]; + if (oscOldLayout == vk::ImageLayout::eShaderReadOnlyOptimal) { + oscSrcStage = vk::PipelineStageFlagBits2::eFragmentShader; + oscSrcAccess = vk::AccessFlagBits2::eShaderRead; + } else if (oscOldLayout == vk::ImageLayout::eColorAttachmentOptimal) { + oscSrcStage = vk::PipelineStageFlagBits2::eColorAttachmentOutput; + oscSrcAccess = vk::AccessFlagBits2::eColorAttachmentWrite; + } else { + oscOldLayout = vk::ImageLayout::eUndefined; + oscSrcStage = vk::PipelineStageFlagBits2::eTopOfPipe; + oscSrcAccess = vk::AccessFlagBits2::eNone; + } + } + vk::ImageMemoryBarrier2 oscToColor2{ + .srcStageMask = oscSrcStage, + .srcAccessMask = oscSrcAccess, + .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead, + .oldLayout = oscOldLayout, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *opaqueSceneColorImages[currentFrame], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depOscToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &oscToColor2}; + commandBuffers[currentFrame].pipelineBarrier2(depOscToColor); + if (currentFrame < opaqueSceneColorImageLayouts.size()) { + opaqueSceneColorImageLayouts[currentFrame] = vk::ImageLayout::eColorAttachmentOptimal; + } + // PASS 1: OFF-SCREEN COLOR (Opaque) + // Clear the off-screen target at the start of 
opaque rendering to a neutral black background + vk::RenderingAttachmentInfo colorAttachment{.imageView = *opaqueSceneColorImageViews[currentFrame], .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, .loadOp = vk::AttachmentLoadOp::eClear, .storeOp = vk::AttachmentStoreOp::eStore, .clearValue = vk::ClearColorValue(std::array < float, 4 >{0.0f, 0.0f, 0.0f, 1.0f})}; + depthAttachment.imageView = *depthImageView; + depthAttachment.loadOp = (didOpaqueDepthPrepass) ? vk::AttachmentLoadOp::eLoad : vk::AttachmentLoadOp::eClear; + vk::RenderingInfo passInfo{.renderArea = vk::Rect2D({0, 0}, swapChainExtent), .layerCount = 1, .colorAttachmentCount = 1, .pColorAttachments = &colorAttachment, .pDepthAttachment = &depthAttachment}; + commandBuffers[currentFrame].beginRendering(passInfo); + vk::Viewport viewport(0.0f, 0.0f, static_cast(swapChainExtent.width), static_cast(swapChainExtent.height), 0.0f, 1.0f); + commandBuffers[currentFrame].setViewport(0, viewport); + vk::Rect2D scissor({0, 0}, swapChainExtent); + commandBuffers[currentFrame].setScissor(0, scissor); { + uint32_t opaqueDrawsThisPass = 0; + for (const auto& job : opaqueJobs) { + bool useBasic = (imguiSystem && !imguiSystem->IsPBREnabled()); + vk::raii::Pipeline* selectedPipeline = nullptr; + vk::raii::PipelineLayout* selectedLayout = nullptr; + if (useBasic) { + selectedPipeline = &graphicsPipeline; + selectedLayout = &pipelineLayout; + } else { + // If masked, we need depth writes with alpha test; otherwise, after-prepass read-only is fine. + if (job.isAlphaMasked) { + selectedPipeline = &pbrGraphicsPipeline; // writes depth, compare Less + } else { + selectedPipeline = didOpaqueDepthPrepass && !!*pbrPrepassGraphicsPipeline ? 
&pbrPrepassGraphicsPipeline : &pbrGraphicsPipeline; + } + selectedLayout = &pbrPipelineLayout; + } + if (currentPipeline != selectedPipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, **selectedPipeline); + currentPipeline = selectedPipeline; + currentLayout = selectedLayout; + } + + std::array buffers = {*job.meshRes->vertexBuffer, *job.entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets); + commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + auto* descSetsPtr = useBasic ? &job.entityRes->basicDescriptorSets : &job.entityRes->pbrDescriptorSets; + if (descSetsPtr->empty() || currentFrame >= descSetsPtr->size()) { + continue; + } + + if (useBasic) { + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + **selectedLayout, + 0, + {*(*descSetsPtr)[currentFrame]}, + {}); + } else { + vk::DescriptorSet set1Opaque = (transparentDescriptorSets.empty() || IsLoading()) + ? 
*transparentFallbackDescriptorSets[currentFrame] + : *transparentDescriptorSets[currentFrame]; + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + **selectedLayout, + 0, + {*(*descSetsPtr)[currentFrame], set1Opaque}, + {}); + + commandBuffers[currentFrame].pushConstants(**selectedLayout, vk::ShaderStageFlagBits::eFragment, 0, {job.entityRes->cachedMaterialProps}); + } + uint32_t instanceCount = std::max(1u, static_cast(job.meshComp->GetInstanceCount())); + commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCount, 0, 0, 0); + ++opaqueDrawsThisPass; + } + } + commandBuffers[currentFrame].endRendering(); + // PASS 1b: PRESENT – composite path + { + // Transition off-screen to SHADER_READ for sampling (Sync2) + vk::ImageMemoryBarrier2 opaqueToSample2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *opaqueSceneColorImages[currentFrame], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depOpaqueToSample{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &opaqueToSample2}; + commandBuffers[currentFrame].pipelineBarrier2(depOpaqueToSample); + if (currentFrame < opaqueSceneColorImageLayouts.size()) { + opaqueSceneColorImageLayouts[currentFrame] = vk::ImageLayout::eShaderReadOnlyOptimal; + } + + // Make the swapchain image ready for color attachment output and clear it (Sync2) + vk::ImageMemoryBarrier2 swapchainToColor2{ + .srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = 
vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = swapChainImages[imageIndex], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depSwapchainToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &swapchainToColor2}; + commandBuffers[currentFrame].pipelineBarrier2(depSwapchainToColor); + + // Begin rendering to swapchain for composite + colorAttachments[0].imageView = *swapChainImageViews[imageIndex]; + colorAttachments[0].loadOp = vk::AttachmentLoadOp::eClear; // clear before composing base layer (full-screen composite overwrites all pixels) + depthAttachment.loadOp = vk::AttachmentLoadOp::eDontCare; // no depth for composite + renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent); + // IMPORTANT: Composite pass does not use a depth attachment. Avoid binding it to satisfy dynamic rendering VUIDs. + auto savedDepthPtr = renderingInfo.pDepthAttachment; // save to restore later + renderingInfo.pDepthAttachment = nullptr; + commandBuffers[currentFrame].beginRendering(renderingInfo); + + // Bind composite pipeline + if (!!*compositePipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *compositePipeline); + } + vk::Viewport vp(0.0f, 0.0f, static_cast(swapChainExtent.width), static_cast(swapChainExtent.height), 0.0f, 1.0f); + commandBuffers[currentFrame].setViewport(0, vp); + vk::Rect2D sc({0, 0}, swapChainExtent); + commandBuffers[currentFrame].setScissor(0, sc); + + // Bind descriptor set 0 for the composite. During loading, force fallback to avoid sampling uninitialized off-screen color. 
+ vk::DescriptorSet setComposite = (transparentDescriptorSets.empty() || IsLoading()) + ? *transparentFallbackDescriptorSets[currentFrame] + : *transparentDescriptorSets[currentFrame]; + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + *compositePipelineLayout, + 0, + {setComposite}, + {}); + + // Push exposure/gamma and sRGB flag + struct CompositePush { + float exposure; + float gamma; + int outputIsSRGB; + float _pad; + } pc{}; + pc.exposure = std::clamp(this->exposure, 0.2f, 4.0f); + pc.gamma = this->gamma; + pc.outputIsSRGB = (swapChainImageFormat == vk::Format::eR8G8B8A8Srgb || swapChainImageFormat == vk::Format::eB8G8R8A8Srgb) ? 1 : 0; + commandBuffers[currentFrame].pushConstants(*compositePipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, pc); + + // Draw fullscreen triangle + commandBuffers[currentFrame].draw(3, 1, 0, 0); + + commandBuffers[currentFrame].endRendering(); + // Restore depth attachment pointer for subsequent passes + renderingInfo.pDepthAttachment = savedDepthPtr; + } + // PASS 2: RENDER TRANSPARENT OBJECTS TO THE SWAPCHAIN + { + // Ensure depth attachment is bound again for the transparent pass + renderingInfo.pDepthAttachment = &depthAttachment; + colorAttachments[0].imageView = *swapChainImageViews[imageIndex]; + colorAttachments[0].loadOp = vk::AttachmentLoadOp::eLoad; + depthAttachment.loadOp = vk::AttachmentLoadOp::eLoad; + renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent); + commandBuffers[currentFrame].beginRendering(renderingInfo); + commandBuffers[currentFrame].setViewport(0, viewport); + commandBuffers[currentFrame].setScissor(0, scissor); + + if (!transparentJobs.empty()) { + currentLayout = &pbrTransparentPipelineLayout; + vk::raii::Pipeline* activeTransparentPipeline = nullptr; + + for (const auto& job : transparentJobs) { + vk::raii::Pipeline* desiredPipeline = job.entityRes->cachedIsGlass ? 
&glassGraphicsPipeline : &pbrBlendGraphicsPipeline; + if (desiredPipeline != activeTransparentPipeline) { + commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, **desiredPipeline); + activeTransparentPipeline = desiredPipeline; + } + + std::array buffers = {*job.meshRes->vertexBuffer, *job.entityRes->instanceBuffer}; + std::array offsets = {0, 0}; + commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets); + commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32); + + vk::DescriptorSet set1 = (transparentDescriptorSets.empty() || IsLoading()) + ? *transparentFallbackDescriptorSets[currentFrame] + : *transparentDescriptorSets[currentFrame]; + commandBuffers[currentFrame].bindDescriptorSets( + vk::PipelineBindPoint::eGraphics, + **currentLayout, + 0, + {*job.entityRes->pbrDescriptorSets[currentFrame], set1}, + {}); + + MaterialProperties pushConstants = job.entityRes->cachedMaterialProps; + if (job.entityRes->cachedIsLiquid) { + pushConstants.transmissionFactor = 0.0f; + } + commandBuffers[currentFrame].pushConstants < MaterialProperties > (**currentLayout, vk::ShaderStageFlagBits::eFragment, 0, { + pushConstants + } + ) + ; + uint32_t instanceCountT = std::max(1u, static_cast(job.meshComp->GetInstanceCount())); + commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCountT, 0, 0, 0); + } + } + // End transparent rendering pass before any layout transitions (even if no transparent draws) + commandBuffers[currentFrame].endRendering(); + } { + // Screenshot and final present transition are handled in rasterization path only + // Ray query path handles these separately + + // Final layout transition for present (rasterization path only) + { + vk::ImageMemoryBarrier2 presentBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eNone, + 
.dstAccessMask = {}, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::ePresentSrcKHR, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = swapChainImages[imageIndex], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depToPresentFinal{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &presentBarrier2}; + commandBuffers[currentFrame].pipelineBarrier2(depToPresentFinal); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = presentBarrier2.newLayout; + } + } + } // skip rasterization when ray query has rendered + + // Render ImGui UI overlay AFTER rasterization/ray query (must always execute regardless of render mode) + // ImGui expects Render() to be called every frame after NewFrame() - skipping it causes hangs + if (imguiSystem && !imguiSystem->IsFrameRendered()) { + // When ray query renders, swapchain is in PRESENT layout with valid content. + // When rasterization renders, swapchain is also in PRESENT layout with valid content. + // Transition to COLOR_ATTACHMENT with loadOp=eLoad to preserve existing pixels for ImGui overlay. + vk::ImageMemoryBarrier2 presentToColor{ + .srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead, + .oldLayout = (imageIndex < swapChainImageLayouts.size()) ? 
swapChainImageLayouts[imageIndex] : vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = swapChainImages[imageIndex], + .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &presentToColor}; + commandBuffers[currentFrame].pipelineBarrier2(depInfo); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = presentToColor.newLayout; + + // Begin a dedicated render pass for ImGui (UI overlay) + vk::RenderingAttachmentInfo imguiColorAttachment{ + .imageView = *swapChainImageViews[imageIndex], + .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, + .loadOp = vk::AttachmentLoadOp::eLoad, // Load existing content + .storeOp = vk::AttachmentStoreOp::eStore + }; + vk::RenderingInfo imguiRenderingInfo{ + .renderArea = vk::Rect2D({0, 0}, swapChainExtent), + .layerCount = 1, + .colorAttachmentCount = 1, + .pColorAttachments = &imguiColorAttachment, + .pDepthAttachment = nullptr + }; + commandBuffers[currentFrame].beginRendering(imguiRenderingInfo); + + imguiSystem->Render(commandBuffers[currentFrame], currentFrame); + + commandBuffers[currentFrame].endRendering(); + + // Transition swapchain back to PRESENT layout after ImGui renders + vk::ImageMemoryBarrier2 colorToPresent{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe, + .dstAccessMask = vk::AccessFlagBits2::eNone, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::ePresentSrcKHR, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = swapChainImages[imageIndex], + .subresourceRange = 
{vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1} + }; + vk::DependencyInfo depInfoBack{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &colorToPresent}; + commandBuffers[currentFrame].pipelineBarrier2(depInfoBack); + if (imageIndex < swapChainImageLayouts.size()) + swapChainImageLayouts[imageIndex] = colorToPresent.newLayout; + } + + commandBuffers[currentFrame].end(); + isRecordingCmd.store(false, std::memory_order_relaxed); + + // Submit and present (Synchronization 2) + uint64_t uploadsValueToWait = 0; + { + std::lock_guard lock(queueMutex); + uint64_t nextUp = nextUploadTimelineValue.load(std::memory_order_relaxed); + uploadsValueToWait = (nextUp > 0) ? (nextUp - 1) : 0; + } + + // Use acquireSemaphoreIndex for imageAvailable semaphore (same as we used in acquireNextImage) + // Use imageIndex for renderFinished semaphore (matches the image being presented) + + std::vector waitInfos = { + vk::SemaphoreSubmitInfo{ + .semaphore = *imageAvailableSemaphores[acquireSemaphoreIndex], + .value = 0, + .stageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .deviceIndex = 0 + }, + vk::SemaphoreSubmitInfo{ + .semaphore = *uploadsTimeline, + .value = uploadsValueToWait, + .stageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .deviceIndex = 0 + }, + // Wait-Before-Signal: Graphics waits for Physics simulation to complete + vk::SemaphoreSubmitInfo{ + .semaphore = *frameTimeline, + .value = currentTimelineValue + TimelineMilestones::ePhysicsFinished, + .stageMask = vk::PipelineStageFlagBits2::eVertexShader, + .deviceIndex = 0 + } + }; + + vk::CommandBufferSubmitInfo cmdInfo{.commandBuffer = *commandBuffers[currentFrame], .deviceMask = 0}; + + std::array signalInfos = { + vk::SemaphoreSubmitInfo{ + .semaphore = *renderFinishedSemaphores[imageIndex], + .value = 0, + .stageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .deviceIndex = 0 + }, + vk::SemaphoreSubmitInfo{ + .semaphore = *frameTimeline, + .value = currentTimelineValue + 
TimelineMilestones::eGpuWorkFinished, + .stageMask = vk::PipelineStageFlagBits2::eAllCommands, + .deviceIndex = 0 + } + }; + + vk::SubmitInfo2 submit2{ + .waitSemaphoreInfoCount = static_cast(waitInfos.size()), + .pWaitSemaphoreInfos = waitInfos.data(), + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdInfo, + .signalSemaphoreInfoCount = static_cast(signalInfos.size()), + .pSignalSemaphoreInfos = signalInfos.data() + }; + + if (framebufferResized.load(std::memory_order_relaxed)) { + vk::SubmitInfo2 emptySubmit2{}; + uint64_t sig = currentTimelineValue + TimelineMilestones::eGpuWorkFinished; + vk::SemaphoreSubmitInfo resizeSignal{.semaphore = *frameTimeline, .value = sig, .stageMask = vk::PipelineStageFlagBits2::eAllCommands}; + emptySubmit2.signalSemaphoreInfoCount = 1; + emptySubmit2.pSignalSemaphoreInfos = &resizeSignal; + Submit2(*graphicsQueue, emptySubmit2, nullptr); + recreateSwapChain(); + return; + } + + // Update watchdog BEFORE queue submit because submit can block waiting for GPU + // This proves frame CPU work is complete even if GPU queue is busy + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + + // Submit work with monotonic timeline value (guarded by Submit2) + Submit2(*graphicsQueue, submit2, nullptr); + + vk::PresentInfoKHR presentInfo{.waitSemaphoreCount = 1, .pWaitSemaphores = &*renderFinishedSemaphores[imageIndex], .swapchainCount = 1, .pSwapchains = &*swapChain, .pImageIndices = &imageIndex}; + vk::Result presentResult = vk::Result::eSuccess; + try { + std::lock_guard lock(queueMutex); + presentResult = presentQueue.presentKHR(presentInfo); + } catch (const vk::OutOfDateKHRError&) { + framebufferResized.store(true, std::memory_order_relaxed); + } + if (presentResult == vk::Result::eSuboptimalKHR || framebufferResized.load(std::memory_order_relaxed)) { + framebufferResized.store(false, std::memory_order_relaxed); + recreateSwapChain(); + } else if (presentResult != vk::Result::eSuccess) { + 
throw std::runtime_error("Failed to present swap chain image"); + } + + currentFrame = (currentFrame + 1) % MAX_FRAMES_IN_FLIGHT; +} + +// Public toggle APIs for planar reflections (keyboard/UI) +void Renderer::SetPlanarReflectionsEnabled(bool enabled) { + // Flip mode and mark resources dirty so RTs are created/destroyed at the next safe point + enablePlanarReflections = enabled; + reflectionResourcesDirty = true; +} + +void Renderer::TogglePlanarReflections() { + SetPlanarReflectionsEnabled(!enablePlanarReflections); +} diff --git a/attachments/sync2_engine/renderer_resources.cpp b/attachments/sync2_engine/renderer_resources.cpp new file mode 100644 index 00000000..e47a2a2e --- /dev/null +++ b/attachments/sync2_engine/renderer_resources.cpp @@ -0,0 +1,4199 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "mesh_component.h" +#include "model_loader.h" +#include "renderer.h" +#include "transform_component.h" +#include +#include +#include +#include +#include +#include +#include +#include + +// stb_image dependency removed; all GLTF textures are uploaded via memory path from ModelLoader. 
+ +// KTX2 support +#include +#include + +// This file contains resource-related methods from the Renderer class + +// Define shared default PBR texture identifiers (static constants) +const std::string Renderer::SHARED_DEFAULT_ALBEDO_ID = "__shared_default_albedo__"; +const std::string Renderer::SHARED_DEFAULT_NORMAL_ID = "__shared_default_normal__"; +const std::string Renderer::SHARED_DEFAULT_METALLIC_ROUGHNESS_ID = "__shared_default_metallic_roughness__"; +const std::string Renderer::SHARED_DEFAULT_OCCLUSION_ID = "__shared_default_occlusion__"; +const std::string Renderer::SHARED_DEFAULT_EMISSIVE_ID = "__shared_default_emissive__"; +const std::string Renderer::SHARED_BRIGHT_RED_ID = "__shared_bright_red__"; + +// Create depth resources +bool Renderer::createDepthResources() { + try { + // Find depth format + vk::Format depthFormat = findDepthFormat(); + + // Create depth image using memory pool + std::tie(depthImage, depthImageAllocation) = createImagePooled( + swapChainExtent.width, + swapChainExtent.height, + depthFormat, + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eDepthStencilAttachment, + vk::MemoryPropertyFlagBits::eDeviceLocal); + + // Create depth image view + depthImageView = createImageView(depthImage, depthFormat, vk::ImageAspectFlagBits::eDepth); + + // Transition depth image layout + transitionImageLayout( + *depthImage, + depthFormat, + vk::ImageLayout::eUndefined, + vk::ImageLayout::eDepthStencilAttachmentOptimal); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create depth resources: " << e.what() << std::endl; + return false; + } +} + +// Helper: coerce an sRGB/UNORM variant of a given VkFormat while preserving block type where possible +static vk::Format CoerceFormatSRGB(vk::Format fmt, bool wantSRGB) { + switch (fmt) { + case vk::Format::eR8G8B8A8Unorm: + return wantSRGB ? vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + case vk::Format::eR8G8B8A8Srgb: + return wantSRGB ? 
vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + + case vk::Format::eBc1RgbUnormBlock: + return wantSRGB ? vk::Format::eBc1RgbSrgbBlock : vk::Format::eBc1RgbUnormBlock; + case vk::Format::eBc1RgbSrgbBlock: + return wantSRGB ? vk::Format::eBc1RgbSrgbBlock : vk::Format::eBc1RgbUnormBlock; + case vk::Format::eBc1RgbaUnormBlock: + return wantSRGB ? vk::Format::eBc1RgbaSrgbBlock : vk::Format::eBc1RgbaUnormBlock; + case vk::Format::eBc1RgbaSrgbBlock: + return wantSRGB ? vk::Format::eBc1RgbaSrgbBlock : vk::Format::eBc1RgbaUnormBlock; + + case vk::Format::eBc2UnormBlock: + return wantSRGB ? vk::Format::eBc2SrgbBlock : vk::Format::eBc2UnormBlock; + case vk::Format::eBc2SrgbBlock: + return wantSRGB ? vk::Format::eBc2SrgbBlock : vk::Format::eBc2UnormBlock; + + case vk::Format::eBc3UnormBlock: + return wantSRGB ? vk::Format::eBc3SrgbBlock : vk::Format::eBc3UnormBlock; + case vk::Format::eBc3SrgbBlock: + return wantSRGB ? vk::Format::eBc3SrgbBlock : vk::Format::eBc3UnormBlock; + + case vk::Format::eBc7UnormBlock: + return wantSRGB ? vk::Format::eBc7SrgbBlock : vk::Format::eBc7UnormBlock; + case vk::Format::eBc7SrgbBlock: + return wantSRGB ? 
vk::Format::eBc7SrgbBlock : vk::Format::eBc7UnormBlock; + + default: + return fmt; + } +} + +// Create texture image +bool Renderer::createTextureImage(const std::string& texturePath_, TextureResources& resources) { + try { + ensureThreadLocalVulkanInit(); + const std::string textureId = ResolveTextureId(texturePath_); + // Check if texture already exists + { + std::shared_lock texLock(textureResourcesMutex); + auto it = textureResources.find(textureId); + if (it != textureResources.end()) { + // Texture already loaded and cached; leave cache intact and return success + return true; + } + } + + // Resolve on-disk path (may differ from logical ID) + std::string resolvedPath = ResolvePath(textureId); + + // Ensure command pool is initialized before any GPU work + if (!*commandPool) { + std::cerr << "createTextureImage: commandPool not initialized yet for '" << textureId << "'" << std::endl; + return false; + } + + // Per-texture de-duplication (serialize loads of the same texture ID only) + { + std::unique_lock lk(textureLoadStateMutex); + while (texturesLoading.contains(textureId)) { + textureLoadStateCv.wait(lk); + } + } + // Double-check cache after the wait + { + std::shared_lock texLock(textureResourcesMutex); + auto it2 = textureResources.find(textureId); + if (it2 != textureResources.end()) { + return true; + } + } + // Mark as loading and ensure we notify on all exit paths + { + std::lock_guard lk(textureLoadStateMutex); + texturesLoading.insert(textureId); + } + auto _loadingGuard = std::unique_ptr>(reinterpret_cast(1), + [this, textureId](void*) { + std::lock_guard lk(textureLoadStateMutex); + texturesLoading.erase(textureId); + textureLoadStateCv.notify_all(); + }); + + // Check if this is a KTX2 file + bool isKtx2 = resolvedPath.ends_with(".ktx2"); + + // If it's a KTX2 texture but the path doesn't exist, try common fallback filename variants + if (isKtx2) { + std::filesystem::path origPath(resolvedPath); + if (!std::filesystem::exists(origPath)) { + 
std::string fname = origPath.filename().string(); + std::string dir = origPath.parent_path().string(); + auto tryCandidate = [&](const std::string& candidateName) -> bool { + std::filesystem::path cand = std::filesystem::path(dir) / candidateName; + if (std::filesystem::exists(cand)) { + std::cout << "Resolved missing texture '" << resolvedPath << "' to existing file '" << cand.string() << "'" << std::endl; + resolvedPath = cand.string(); + return true; + } + return false; + }; + // Known suffix variants near the end of filename before extension + // Examples: *_c.ktx2, *_d.ktx2, *_cm.ktx2, *_diffuse.ktx2, *_basecolor.ktx2, *_albedo.ktx2 + std::vector suffixes = {"_c", "_d", "_cm", "_diffuse", "_basecolor", "_albedo"}; + // If filename matches one known suffix, try others + for (const auto& s : suffixes) { + std::string key = s + ".ktx2"; + if (fname.ends_with(key)) { + std::string prefix = fname.substr(0, fname.size() - key.size()); + for (const auto& alt : suffixes) { + if (alt == s) + continue; + std::string candName = prefix + alt + ".ktx2"; + if (tryCandidate(candName)) { + isKtx2 = true; + break; + } + } + break; // Only replace last suffix occurrence + } + } + } + } + + int texWidth, texHeight, texChannels; + unsigned char* pixels = nullptr; + ktxTexture2* ktxTex = nullptr; + vk::DeviceSize imageSize; + + // Track KTX2 transcoding state across the function scope (BasisU only) + bool wasTranscoded = false; + // Track KTX2 header-provided VkFormat (0 == VK_FORMAT_UNDEFINED) + uint32_t headerVkFormatRaw = 0; + + uint32_t mipLevels = 1; + std::vector copyRegions; + + if (isKtx2) { + // Load KTX2 file + KTX_error_code result = ktxTexture2_CreateFromNamedFile(resolvedPath.c_str(), + KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, + &ktxTex); + if (result != KTX_SUCCESS) { + // Retry with sibling suffix variants if file exists but cannot be parsed/opened + std::filesystem::path origPath(resolvedPath); + std::string fname = origPath.filename().string(); + std::string dir = 
origPath.parent_path().string(); + auto tryLoad = [&](const std::string& candidateName) -> bool { + std::filesystem::path cand = std::filesystem::path(dir) / candidateName; + if (std::filesystem::exists(cand)) { + std::string candStr = cand.string(); + std::cout << "Retrying KTX2 load with sibling candidate '" << candStr << "' for original '" << resolvedPath << "'" << std::endl; + result = ktxTexture2_CreateFromNamedFile(candStr.c_str(), KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, &ktxTex); + if (result == KTX_SUCCESS) { + resolvedPath = candStr; // Use the successfully opened candidate + return true; + } + } + return false; + }; + // Known suffix variants near the end of filename before extension + std::vector suffixes = {"_c", "_d", "_cm", "_diffuse", "_basecolor", "_albedo"}; + for (const auto& s : suffixes) { + std::string key = s + ".ktx2"; + if (fname.ends_with(key)) { + std::string prefix = fname.substr(0, fname.size() - key.size()); + bool loaded = false; + for (const auto& alt : suffixes) { + if (alt == s) + continue; + std::string candName = prefix + alt + ".ktx2"; + if (tryLoad(candName)) { + loaded = true; + break; + } + } + if (loaded) + break; + } + } + } + + // Bail out if we still failed to load + if (result != KTX_SUCCESS || ktxTex == nullptr) { + std::cerr << "Failed to load KTX2 texture: " << resolvedPath << " (error: " << result << ")" << std::endl; + return false; + } + + // Read header-provided vkFormat (if already GPU-compressed/transcoded offline) + headerVkFormatRaw = static_cast(ktxTex->vkFormat); + + // Check if the texture needs BasisU transcoding; prefer GPU-compressed targets to save VRAM + wasTranscoded = ktxTexture2_NeedsTranscoding(ktxTex); + if (wasTranscoded) { + // Select a compressed target supported by the device (prefer BC7 RGBA, then BC3 RGBA, then BC1 RGB) + auto supportsFormat = [&](vk::Format f) { + auto props = physicalDevice.getFormatProperties(f); + return static_cast(props.optimalTilingFeatures & 
vk::FormatFeatureFlagBits::eSampledImage); + }; + bool wantSrgb = (Renderer::determineTextureFormat(resolvedPath) == vk::Format::eR8G8B8A8Srgb); + KTX_error_code tcErr = KTX_SUCCESS; + if (supportsFormat(vk::Format::eBc7UnormBlock) || supportsFormat(vk::Format::eBc7SrgbBlock)) { + tcErr = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_BC7_RGBA, 0); + } else if (supportsFormat(vk::Format::eBc3UnormBlock) || supportsFormat(vk::Format::eBc3SrgbBlock)) { + tcErr = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_BC3_RGBA, 0); + } else if (supportsFormat(vk::Format::eBc1RgbUnormBlock) || supportsFormat(vk::Format::eBc1RgbSrgbBlock)) { + tcErr = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_BC1_RGB, 0); + } else { + // Fallback to RGBA32 if no BC formats are supported + tcErr = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_RGBA32, 0); + } + if (tcErr != KTX_SUCCESS) { + std::cerr << "Failed to transcode KTX2 BasisU texture: " << resolvedPath << " (error: " << tcErr << ")" << std::endl; + ktxTexture_Destroy(reinterpret_cast(ktxTex)); + return false; + } + } + + texWidth = ktxTex->baseWidth; + texHeight = ktxTex->baseHeight; + texChannels = 4; // logical channels; compressed size handled by libktx + + // Use all levels present in the KTX container + mipLevels = std::max(1u, ktxTex->numLevels); + + // Total data size across all mip levels + imageSize = ktxTexture_GetDataSize(reinterpret_cast(ktxTex)); + + // Build copy regions for every mip level in the file + copyRegions.clear(); + copyRegions.reserve(mipLevels); + for (uint32_t level = 0; level < mipLevels; ++level) { + ktx_size_t levelOffset = 0; + ktxTexture_GetImageOffset(reinterpret_cast(ktxTex), level, 0, 0, &levelOffset); + uint32_t w = std::max(1u, static_cast(texWidth) >> level); + uint32_t h = std::max(1u, static_cast(texHeight) >> level); + copyRegions.push_back({ + .bufferOffset = static_cast(levelOffset), + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = { + .aspectMask = vk::ImageAspectFlagBits::eColor, 
+ .mipLevel = level, + .baseArrayLayer = 0, + .layerCount = 1 + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {w, h, 1} + }); + } + } else { + // Non-KTX texture loading via file path is disabled to simplify pipeline. + std::cerr << "Unsupported non-KTX2 texture path: " << textureId << std::endl; + return false; + } + + // Create staging buffer + auto [stagingBuffer, stagingBufferMemory] = createBuffer( + imageSize, + vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Copy pixel data to staging buffer + void* data = stagingBufferMemory.mapMemory(0, imageSize); + + if (isKtx2) { + // Copy entire KTX2 image data blob (all mip levels) + const uint8_t* allData = ktxTexture_GetData(reinterpret_cast(ktxTex)); + const ktx_size_t dataSz = ktxTexture_GetDataSize(reinterpret_cast(ktxTex)); + memcpy(data, allData, static_cast(dataSz)); + } else { + // Copy regular image data + memcpy(data, pixels, static_cast(imageSize)); + } + + stagingBufferMemory.unmapMemory(); + + // Determine appropriate texture format + vk::Format textureFormat; + const bool wantSRGB = (Renderer::determineTextureFormat(textureId) == vk::Format::eR8G8B8A8Srgb); + bool alphaMaskedHint = false; + if (isKtx2) { + // If the KTX2 provided a valid VkFormat and we did NOT transcode, respect its block type + // but coerce the sRGB/UNORM variant based on texture usage (baseColor vs data maps) + if (!wasTranscoded) { + VkFormat headerFmt = static_cast(headerVkFormatRaw); + if (headerFmt != VK_FORMAT_UNDEFINED) { + textureFormat = CoerceFormatSRGB(static_cast(headerFmt), wantSRGB); + } else { + textureFormat = wantSRGB ? 
vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + } + // Can't easily scan alpha in compressed formats here; leave hint at default false + } else { + // We transcoded; choose a Vulkan format matching the transcode target (we requested BC7/BC3/BC1 or RGBA32 fallback) + // There is no direct query from KTX for chosen VkFormat after transcoding, so infer by capabilities using our preference order. + bool wantSRGB2 = wantSRGB; + if (!!physicalDevice.getFormatProperties(vk::Format::eBc7UnormBlock).optimalTilingFeatures || !!physicalDevice.getFormatProperties(vk::Format::eBc7SrgbBlock).optimalTilingFeatures) { + textureFormat = wantSRGB2 ? vk::Format::eBc7SrgbBlock : vk::Format::eBc7UnormBlock; + } else if (!!physicalDevice.getFormatProperties(vk::Format::eBc3UnormBlock).optimalTilingFeatures || !!physicalDevice.getFormatProperties(vk::Format::eBc3SrgbBlock).optimalTilingFeatures) { + textureFormat = wantSRGB2 ? vk::Format::eBc3SrgbBlock : vk::Format::eBc3UnormBlock; + } else if (!!physicalDevice.getFormatProperties(vk::Format::eBc1RgbUnormBlock).optimalTilingFeatures || !!physicalDevice.getFormatProperties(vk::Format::eBc1RgbSrgbBlock).optimalTilingFeatures) { + textureFormat = wantSRGB2 ? vk::Format::eBc1RgbSrgbBlock : vk::Format::eBc1RgbUnormBlock; + } else { + // Fallback to uncompressed RGBA + textureFormat = wantSRGB2 ? vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + // We have CPU-visible RGBA data; detect alpha for masked hint + ktx_size_t offsetScan = 0; + ktxTexture_GetImageOffset(reinterpret_cast(ktxTex), 0, 0, 0, &offsetScan); + const uint8_t* rgba = ktxTexture_GetData(reinterpret_cast(ktxTex)) + offsetScan; + size_t pixelCount = static_cast(texWidth) * static_cast(texHeight); + for (size_t i = 0; i < pixelCount; ++i) { + if (rgba[i * 4 + 3] < 250) { + alphaMaskedHint = true; + break; + } + } + } + } + } else { + textureFormat = wantSRGB ? 
vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm; + } + + // Now that we're done reading libktx data, destroy the KTX texture to avoid leaks + if (isKtx2 && ktxTex) { + ktxTexture_Destroy(reinterpret_cast(ktxTex)); + ktxTex = nullptr; + } + + // Create texture image using memory pool + bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value(); + std::vector families; + if (differentFamilies) { + families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()}; + } + // KTX2 mip levels are set above (line 306); mipLevels already reflects what the file contains + // KTX2 files come with pre-generated mips, so we don't need TRANSFER_SRC for blit generation + vk::ImageUsageFlags usageFlags = vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; + + // Create image with OOM fallback: retry with mipLevels=1 and reduced usage if needed + try { + auto [textureImg, textureImgAllocation] = createImagePooled( + texWidth, + texHeight, + textureFormat, + vk::ImageTiling::eOptimal, + usageFlags, + vk::MemoryPropertyFlagBits::eDeviceLocal, + /*mipLevels*/ + mipLevels, + differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, + families); + resources.textureImage = std::move(textureImg); + resources.textureImageAllocation = std::move(textureImgAllocation); + } catch (const std::exception& e) { + std::cerr << "Image allocation failed (" << resolvedPath << "): " << e.what() << ". Retrying with mipLevels=1..." << std::endl; + // Retry with a single mip level and no TRANSFER_SRC usage to reduce memory pressure + mipLevels = 1; + usageFlags &= ~vk::ImageUsageFlagBits::eTransferSrc; + auto [textureImg2, textureImgAllocation2] = createImagePooled( + texWidth, + texHeight, + textureFormat, + vk::ImageTiling::eOptimal, + usageFlags, + vk::MemoryPropertyFlagBits::eDeviceLocal, + /*mipLevels*/ + mipLevels, + differentFamilies ? 
vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, + families); + resources.textureImage = std::move(textureImg2); + resources.textureImageAllocation = std::move(textureImgAllocation2); + } + + // GPU upload for this texture (copies all regions provided) + uploadImageFromStaging(*stagingBuffer, *resources.textureImage, textureFormat, copyRegions, mipLevels, imageSize); + + // KTX2 files provide their own mip levels; no runtime generation needed + // Store the format and mipLevels for createTextureImageView + resources.format = textureFormat; + resources.mipLevels = mipLevels; + resources.alphaMaskedHint = alphaMaskedHint; + + // Create texture image view + if (!createTextureImageView(resources)) { + return false; + } + + // Create texture sampler + if (!createTextureSampler(resources)) { + return false; + } + + // Add to texture resources map (guarded) + { + std::unique_lock texLock(textureResourcesMutex); + textureResources[textureId] = std::move(resources); + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create texture image: " << e.what() << std::endl; + return false; + } +} + +// Create texture image view +bool Renderer::createTextureImageView(TextureResources& resources) { + try { + ensureThreadLocalVulkanInit(); + resources.textureImageView = createImageView( + resources.textureImage, + resources.format, + // Use the stored format instead of hardcoded sRGB + vk::ImageAspectFlagBits::eColor, + resources.mipLevels // Use the stored mipLevels + ); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create texture image view: " << e.what() << std::endl; + return false; + } +} + +// Create shared default PBR textures (to avoid creating hundreds of identical textures) +bool Renderer::createSharedDefaultPBRTextures() { + try { + unsigned char translucentPixel[4] = {128, 128, 128, 128}; // 50% alpha (128/255) + if (!LoadTextureFromMemory(SHARED_DEFAULT_ALBEDO_ID, translucentPixel, 1, 1, 4)) { + 
std::cerr << "Failed to create shared default albedo texture" << std::endl; + return false; + } + + // Create shared default normal texture (flat normal) + unsigned char normalPixel[4] = {128, 128, 255, 255}; // (0.5, 0.5, 1.0, 1.0) in 0-255 range + if (!LoadTextureFromMemory(SHARED_DEFAULT_NORMAL_ID, normalPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default normal texture" << std::endl; + return false; + } + + // Create shared metallic-roughness texture (non-metallic, fully rough) + unsigned char metallicRoughnessPixel[4] = {0, 255, 0, 255}; // (unused, roughness=1.0, metallic=0.0, alpha=1.0) + if (!LoadTextureFromMemory(SHARED_DEFAULT_METALLIC_ROUGHNESS_ID, metallicRoughnessPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default metallic-roughness texture" << std::endl; + return false; + } + + // Create shared default occlusion texture (white - no occlusion) + unsigned char occlusionPixel[4] = {255, 255, 255, 255}; + if (!LoadTextureFromMemory(SHARED_DEFAULT_OCCLUSION_ID, occlusionPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default occlusion texture" << std::endl; + return false; + } + + // Create shared default emissive texture (black - no emission) + unsigned char emissivePixel[4] = {0, 0, 0, 255}; + if (!LoadTextureFromMemory(SHARED_DEFAULT_EMISSIVE_ID, emissivePixel, 1, 1, 4)) { + std::cerr << "Failed to create shared default emissive texture" << std::endl; + return false; + } + + // Create shared bright red texture for ball visibility + unsigned char brightRedPixel[4] = {255, 0, 0, 255}; // Bright red (R=255, G=0, B=0, A=255) + if (!LoadTextureFromMemory(SHARED_BRIGHT_RED_ID, brightRedPixel, 1, 1, 4)) { + std::cerr << "Failed to create shared bright red texture" << std::endl; + return false; + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create shared default PBR textures: " << e.what() << std::endl; + return false; + } +} + +// Create default texture resources (1x1 white texture) 
+bool Renderer::createDefaultTextureResources() { + try { + // Create a 1x1 white texture + const uint32_t width = 1; + const uint32_t height = 1; + const uint32_t pixelSize = 4; // RGBA + const std::vector pixels = {255, 255, 255, 255}; // White pixel (RGBA) + + // Create staging buffer + vk::DeviceSize imageSize = width * height * pixelSize; + auto [stagingBuffer, stagingBufferMemory] = createBuffer( + imageSize, + vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Copy pixel data to staging buffer + void* data = stagingBufferMemory.mapMemory(0, imageSize); + memcpy(data, pixels.data(), static_cast(imageSize)); + stagingBufferMemory.unmapMemory(); + + // Create texture image using memory pool + auto [textureImg, textureImgAllocation] = createImagePooled( + width, + height, + vk::Format::eR8G8B8A8Srgb, + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled, + vk::MemoryPropertyFlagBits::eDeviceLocal); + + defaultTextureResources.textureImage = std::move(textureImg); + defaultTextureResources.textureImageAllocation = std::move(textureImgAllocation); + + // Transition image layout for copy + transitionImageLayout( + *defaultTextureResources.textureImage, + vk::Format::eR8G8B8A8Srgb, + vk::ImageLayout::eUndefined, + vk::ImageLayout::eTransferDstOptimal); + + // Copy buffer to image + vk::BufferImageCopy region{ + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1 + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {width, height, 1} + }; + copyBufferToImage( + *stagingBuffer, + *defaultTextureResources.textureImage, + width, + height, + region); + + // Transition image layout for shader access + transitionImageLayout( + *defaultTextureResources.textureImage, + vk::Format::eR8G8B8A8Srgb, + 
vk::ImageLayout::eTransferDstOptimal, + vk::ImageLayout::eShaderReadOnlyOptimal); + + // Create texture image view + defaultTextureResources.textureImageView = createImageView( + defaultTextureResources.textureImage, + vk::Format::eR8G8B8A8Srgb, + vk::ImageAspectFlagBits::eColor); + + // Create texture sampler + return createTextureSampler(defaultTextureResources); + } catch (const std::exception& e) { + std::cerr << "Failed to create default texture resources: " << e.what() << std::endl; + return false; + } +} + +// Create texture sampler +bool Renderer::createTextureSampler(TextureResources& resources) { + try { + ensureThreadLocalVulkanInit(); + // Get physical device properties + vk::PhysicalDeviceProperties properties = physicalDevice.getProperties(); + + // Create sampler with mipmapping + anisotropy (clamped to device limit) + float deviceMaxAniso = properties.limits.maxSamplerAnisotropy; + float desiredAniso = std::clamp(samplerMaxAnisotropy, 1.0f, deviceMaxAniso); + float maxLod = resources.mipLevels > 1 ? static_cast(resources.mipLevels - 1) : 0.0f; + vk::SamplerCreateInfo samplerInfo{ + .magFilter = vk::Filter::eLinear, + .minFilter = vk::Filter::eLinear, + .mipmapMode = vk::SamplerMipmapMode::eLinear, + .addressModeU = vk::SamplerAddressMode::eRepeat, + .addressModeV = vk::SamplerAddressMode::eRepeat, + .addressModeW = vk::SamplerAddressMode::eRepeat, + .mipLodBias = 0.0f, + .anisotropyEnable = desiredAniso > 1.0f ? 
VK_TRUE : VK_FALSE, + .maxAnisotropy = desiredAniso, + .compareEnable = VK_FALSE, + .compareOp = vk::CompareOp::eAlways, + .minLod = 0.0f, + .maxLod = maxLod, + .borderColor = vk::BorderColor::eIntOpaqueBlack, + .unnormalizedCoordinates = VK_FALSE + }; + + resources.textureSampler = vk::raii::Sampler(device, samplerInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create texture sampler: " << e.what() << std::endl; + return false; + } +} + +// Load texture from file (public wrapper for createTextureImage) +bool Renderer::LoadTexture(const std::string& texturePath) { + ensureThreadLocalVulkanInit(); + if (texturePath.empty()) { + std::cerr << "LoadTexture: Empty texture path provided" << std::endl; + return false; + } + + // Resolve aliases (canonical ID -> actual key) + const std::string resolvedId = ResolveTextureId(texturePath); + + // Check if texture is already loaded + { + std::shared_lock texLock(textureResourcesMutex); + auto it = textureResources.find(resolvedId); + if (it != textureResources.end()) { + // Texture already loaded + return true; + } + } + + // Create temporary texture resources (unused output; cache will be populated internally) + TextureResources tempResources; + + // Use existing createTextureImage method (it inserts into textureResources on success) if it's a KTX2 path; otherwise fall back to memory path below + bool success = false; + if (resolvedId.ends_with(".ktx2")) { + success = createTextureImage(resolvedId, tempResources); + if (success) + return true; + // Fall through to raw-memory path if KTX load failed + } + + if (!success) { + std::cerr << "Failed to load texture: " << texturePath << std::endl; + } + + return success; +} + +// Determine appropriate texture format based on texture type +vk::Format Renderer::determineTextureFormat(const std::string& textureId) { + // Determine sRGB vs Linear in a case-insensitive way + std::string idLower = textureId; + std::ranges::transform(idLower, 
idLower.begin(), [](unsigned char c) { return static_cast(std::tolower(c)); }); + + // BaseColor/Albedo/Diffuse & SpecGloss RGB should be sRGB for proper gamma correction + if (idLower.find("basecolor") != std::string::npos || + idLower.find("base_color") != std::string::npos || + idLower.find("albedo") != std::string::npos || + idLower.find("diffuse") != std::string::npos || + idLower.find("specgloss") != std::string::npos || + idLower.find("specularglossiness") != std::string::npos || + textureId == Renderer::SHARED_DEFAULT_ALBEDO_ID) { + return vk::Format::eR8G8B8A8Srgb; + } + + // Emissive is color data and should be sampled in sRGB + if (idLower.find("emissive") != std::string::npos || + textureId == Renderer::SHARED_DEFAULT_EMISSIVE_ID) { + return vk::Format::eR8G8B8A8Srgb; + } + + // Shared bright red (ball) is a color texture; ensure sRGB for vivid appearance + if (textureId == Renderer::SHARED_BRIGHT_RED_ID) { + return vk::Format::eR8G8B8A8Srgb; + } + + // All other PBR textures (normal, metallic-roughness, occlusion) should be linear + // because they contain non-color data that shouldn't be gamma corrected + return vk::Format::eR8G8B8A8Unorm; +} + +// Load texture from raw image data in memory +bool Renderer::LoadTextureFromMemory(const std::string& textureId, + const unsigned char* imageData, + int width, + int height, + int channels) { + ensureThreadLocalVulkanInit(); + const std::string resolvedId = ResolveTextureId(textureId); + std::cout << "[LoadTextureFromMemory] start id=" << textureId << " -> resolved=" << resolvedId << " size=" << width << "x" << height << " ch=" << channels << std::endl; + if (resolvedId.empty() || !imageData || width <= 0 || height <= 0 || channels <= 0) { + std::cerr << "LoadTextureFromMemory: Invalid parameters" << std::endl; + return false; + } + + // Check if texture is already loaded + { + std::shared_lock texLock(textureResourcesMutex); + auto it = textureResources.find(resolvedId); + if (it != textureResources.end()) { 
+ // Texture already loaded + return true; + } + } + + // Per-texture de-duplication (serialize loads of the same texture ID only) + { + std::unique_lock lk(textureLoadStateMutex); + while (texturesLoading.contains(resolvedId)) { + textureLoadStateCv.wait(lk); + } + } + // Double-check cache after the wait + { + std::shared_lock texLock(textureResourcesMutex); + auto it2 = textureResources.find(resolvedId); + if (it2 != textureResources.end()) { + return true; + } + } + // Mark as loading and ensure we notify on all exit paths + { + std::lock_guard lk(textureLoadStateMutex); + texturesLoading.insert(resolvedId); + } + auto _loadingGuard = std::unique_ptr>(reinterpret_cast(1), + [this, resolvedId](void*) { + std::lock_guard lk(textureLoadStateMutex); + texturesLoading.erase(resolvedId); + textureLoadStateCv.notify_all(); + }); + + try { + TextureResources resources; + + // Calculate image size (ensure 4 channels for RGBA) + int targetChannels = 4; // Always use RGBA for consistency + vk::DeviceSize imageSize = width * height * targetChannels; + + // Create a staging buffer + auto [stagingBuffer, stagingBufferMemory] = createBuffer( + imageSize, + vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Copy and convert pixel data to staging buffer + void* data = stagingBufferMemory.mapMemory(0, imageSize); + auto* stagingData = static_cast(data); + + if (channels == 4) { + // Already RGBA, direct copy + memcpy(stagingData, imageData, imageSize); + } else if (channels == 3) { + // RGB to RGBA conversion + for (int i = 0; i < width * height; ++i) { + stagingData[i * 4 + 0] = imageData[i * 3 + 0]; // R + stagingData[i * 4 + 1] = imageData[i * 3 + 1]; // G + stagingData[i * 4 + 2] = imageData[i * 3 + 2]; // B + stagingData[i * 4 + 3] = 255; // A + } + } else if (channels == 2) { + // Grayscale + Alpha to RGBA conversion + for (int i = 0; i < width * height; ++i) { + stagingData[i * 4 + 0] = 
imageData[i * 2 + 0]; // R (grayscale) + stagingData[i * 4 + 1] = imageData[i * 2 + 0]; // G (grayscale) + stagingData[i * 4 + 2] = imageData[i * 2 + 0]; // B (grayscale) + stagingData[i * 4 + 3] = imageData[i * 2 + 1]; // A (alpha) + } + } else if (channels == 1) { + // Grayscale to RGBA conversion + for (int i = 0; i < width * height; ++i) { + stagingData[i * 4 + 0] = imageData[i]; // R + stagingData[i * 4 + 1] = imageData[i]; // G + stagingData[i * 4 + 2] = imageData[i]; // B + stagingData[i * 4 + 3] = 255; // A + } + } else { + std::cerr << "LoadTextureFromMemory: Unsupported channel count: " << channels << std::endl; + stagingBufferMemory.unmapMemory(); + return false; + } + + // Analyze alpha to set alphaMaskedHint (treat as masked if any pixel alpha < ~1.0) + bool alphaMaskedHint = false; + for (int i = 0, n = width * height; i < n; ++i) { + if (stagingData[i * 4 + 3] < 250) { + alphaMaskedHint = true; + break; + } + } + + stagingBufferMemory.unmapMemory(); + + // Determine the appropriate texture format based on the texture type + vk::Format textureFormat = determineTextureFormat(textureId); + + // Create texture image using memory pool (with optional mipmap generation) + bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value(); + std::vector families; + if (differentFamilies) { + families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()}; + } + // Decide mip count and usage for memory textures; cap to reduce VRAM pressure + uint32_t mipLevels = 1; + if (width > 1 && height > 1) { + uint32_t full = static_cast(std::floor(std::log2(std::max(width, height)))) + 1; + mipLevels = std::max(1u, std::min(full, maxAutoGeneratedMipLevels)); + } + vk::ImageUsageFlags usageFlags = vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; + if (mipLevels > 1) + usageFlags |= vk::ImageUsageFlagBits::eTransferSrc; + + // OOM-resilient allocation + try { + auto 
[textureImg, textureImgAllocation] = createImagePooled( + width, + height, + textureFormat, + vk::ImageTiling::eOptimal, + usageFlags, + vk::MemoryPropertyFlagBits::eDeviceLocal, + mipLevels, + differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, + families); + + resources.textureImage = std::move(textureImg); + resources.textureImageAllocation = std::move(textureImgAllocation); + } catch (const std::exception& e) { + std::cerr << "Image allocation failed (memory texture): " << e.what() << ". Retrying with mipLevels=1..." << std::endl; + mipLevels = 1; + usageFlags &= ~vk::ImageUsageFlagBits::eTransferSrc; + auto [textureImg, textureImgAllocation] = createImagePooled( + width, + height, + textureFormat, + vk::ImageTiling::eOptimal, + usageFlags, + vk::MemoryPropertyFlagBits::eDeviceLocal, + mipLevels, + differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, + families); + resources.textureImage = std::move(textureImg); + resources.textureImageAllocation = std::move(textureImgAllocation); + } + + // GPU upload. Copy buffer to image in a single submit. 
+ vk::BufferImageCopy region{ + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1 + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {static_cast(width), static_cast(height), 1} + }; + uploadImageFromStaging(*stagingBuffer, *resources.textureImage, textureFormat, region, mipLevels, imageSize); + + // Generate mip chain if requested and format is uncompressed RGBA + if (mipLevels > 1 && (textureFormat == vk::Format::eR8G8B8A8Srgb || textureFormat == vk::Format::eR8G8B8A8Unorm)) { + generateMipmaps(*resources.textureImage, textureFormat, width, height, mipLevels); + } + + // Store the format for createTextureImageView + resources.format = textureFormat; + resources.mipLevels = mipLevels; + resources.alphaMaskedHint = alphaMaskedHint; + + // Use resolvedId as the cache key to avoid duplicates + const std::string& cacheId = resolvedId; + + // Create texture image view + resources.textureImageView = createImageView( + resources.textureImage, + textureFormat, + vk::ImageAspectFlagBits::eColor, + mipLevels); + + // Create texture sampler + if (!createTextureSampler(resources)) { + return false; + } + + // Add to texture resources map (guarded) + { + std::unique_lock texLock(textureResourcesMutex); + textureResources[cacheId] = std::move(resources); + } + + std::cout << "Successfully loaded texture from memory: " << cacheId + << " (" << width << "x" << height << ", " << channels << " channels)" << std::endl; + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to load texture from memory: " << e.what() << std::endl; + return false; + } +} + +// Create mesh resources +bool Renderer::createMeshResources(MeshComponent* meshComponent, bool deferUpload) { + ensureThreadLocalVulkanInit(); + try { + // If resources already exist, check if we need to flush staging buffers + { + std::shared_lock 
lock(meshResourcesMutex); + auto it = meshResources.find(meshComponent); + if (it != meshResources.end()) { + if (!deferUpload) { + MeshResources& res = it->second; + if ((res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) || + (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer)) { + // Need unique lock to modify res (and meshResources values) + lock.unlock(); + std::unique_lock uniqueLock(meshResourcesMutex); + auto itUnique = meshResources.find(meshComponent); + if (itUnique != meshResources.end()) { + MeshResources& resU = itUnique->second; + if (resU.vertexBufferSizeBytes > 0 && !!*resU.stagingVertexBuffer && !!*resU.vertexBuffer) { + copyBuffer(resU.stagingVertexBuffer, resU.vertexBuffer, resU.vertexBufferSizeBytes); + resU.stagingVertexBuffer = vk::raii::Buffer(nullptr); + resU.stagingVertexBufferAllocation = nullptr; + resU.vertexBufferSizeBytes = 0; + } + if (resU.indexBufferSizeBytes > 0 && !!*resU.stagingIndexBuffer && !!*resU.indexBuffer) { + copyBuffer(resU.stagingIndexBuffer, resU.indexBuffer, resU.indexBufferSizeBytes); + resU.stagingIndexBuffer = vk::raii::Buffer(nullptr); + resU.stagingIndexBufferAllocation = nullptr; + resU.indexBufferSizeBytes = 0; + } + } + } + } + return true; + } + } + + // Get mesh data + const auto& vertices = meshComponent->GetVertices(); + const auto& indices = meshComponent->GetIndices(); + + if (vertices.empty() || indices.empty()) { + std::cerr << "Mesh has no vertices or indices" << std::endl; + return false; + } + + // --- 1. 
Create and fill per-mesh staging buffers using memory pool --- + vk::DeviceSize vertexBufferSize = sizeof(vertices[0]) * vertices.size(); + KickWatchdog(); + auto [stagingVertexBuffer, stagingVertexBufferAllocation] = createBufferPooled( + vertexBufferSize, + vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + if (!stagingVertexBufferAllocation) { + throw std::runtime_error("Failed to allocate staging vertex buffer from memory pool"); + } + void* vertexData = stagingVertexBufferAllocation->mappedPtr; + if (vertexData) { + std::memcpy(vertexData, vertices.data(), static_cast(vertexBufferSize)); + } + + KickWatchdog(); + vk::DeviceSize indexBufferSize = sizeof(indices[0]) * indices.size(); + auto [stagingIndexBuffer, stagingIndexBufferAllocation] = createBufferPooled( + indexBufferSize, + vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + if (!stagingIndexBufferAllocation) { + throw std::runtime_error("Failed to allocate staging index buffer from memory pool"); + } + void* indexData = stagingIndexBufferAllocation->mappedPtr; + if (indexData) { + std::memcpy(indexData, indices.data(), static_cast(indexBufferSize)); + } + + // --- 2. 
Create device-local vertex and index buffers via the memory pool --- + // Add ray tracing flags: eShaderDeviceAddress for vkGetBufferDeviceAddress and + // eAccelerationStructureBuildInputReadOnlyKHR for acceleration structure building + KickWatchdog(); + auto [vertexBuffer, vertexBufferAllocation] = createBufferPooled( + vertexBufferSize, + vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eVertexBuffer | + vk::BufferUsageFlagBits::eShaderDeviceAddress | vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR, + vk::MemoryPropertyFlagBits::eDeviceLocal); + + KickWatchdog(); + auto [indexBuffer, indexBufferAllocation] = createBufferPooled( + indexBufferSize, + vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eIndexBuffer | + vk::BufferUsageFlagBits::eShaderDeviceAddress | vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR, + vk::MemoryPropertyFlagBits::eDeviceLocal); + + // --- 3. Either copy now (legacy path) or defer copies for batched submission --- + MeshResources resources; + resources.vertexBuffer = std::move(vertexBuffer); + resources.vertexBufferAllocation = std::move(vertexBufferAllocation); + resources.indexBuffer = std::move(indexBuffer); + resources.indexBufferAllocation = std::move(indexBufferAllocation); + resources.indexCount = static_cast(indices.size()); + + if (deferUpload) { + // Keep staging buffers alive and record their sizes; copies will be + // performed later by preAllocateEntityResourcesBatch(). 
+ resources.stagingVertexBuffer = std::move(stagingVertexBuffer); + resources.stagingVertexBufferAllocation = std::move(stagingVertexBufferAllocation); + resources.vertexBufferSizeBytes = vertexBufferSize; + + resources.stagingIndexBuffer = std::move(stagingIndexBuffer); + resources.stagingIndexBufferAllocation = std::move(stagingIndexBufferAllocation); + resources.indexBufferSizeBytes = indexBufferSize; + } else { + // Immediate upload path used by preAllocateEntityResources() and other + // small-object callers. This preserves existing behaviour. + copyBuffer(stagingVertexBuffer, resources.vertexBuffer, vertexBufferSize); + copyBuffer(stagingIndexBuffer, resources.indexBuffer, indexBufferSize); + // staging* buffers are RAII objects and will be destroyed on scope exit. + } + + // Add to mesh resources map with unique lock + { + std::unique_lock lock(meshResourcesMutex); + meshResources[meshComponent] = std::move(resources); + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create mesh resources: " << e.what() << std::endl; + return false; + } +} + +// Create uniform buffers +bool Renderer::createUniformBuffers(Entity* entity) { + ensureThreadLocalVulkanInit(); + try { + // Kick watchdog periodically during heavy buffer creation (if called from a long loop) + static uint32_t bufferWatchdogCounter = 0; + if (++bufferWatchdogCounter % 50 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + + // Check if entity resources already exist with shared lock + { + std::shared_lock lock(entityResourcesMutex); + auto it = entityResources.find(entity); + if (it != entityResources.end()) { + return true; + } + } + + // Create entity resources + EntityResources resources; + + // Create uniform buffers using memory pool + vk::DeviceSize bufferSize = sizeof(UniformBufferObject); + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + auto [buffer, bufferAllocation] = createBufferPooled( + 
bufferSize, + vk::BufferUsageFlagBits::eUniformBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + if (!bufferAllocation) { + throw std::runtime_error("Failed to allocate uniform buffer from memory pool"); + } + // Use the memory pool's mapped pointer if available + void* mappedMemory = bufferAllocation->mappedPtr; + if (!mappedMemory) { + std::cerr << "Warning: Uniform buffer allocation is not mapped" << std::endl; + } + + resources.uniformBuffers.emplace_back(std::move(buffer)); + resources.uniformBufferAllocations.emplace_back(std::move(bufferAllocation)); + resources.uniformBuffersMapped.emplace_back(mappedMemory); + } + + // Initialize descriptor initialization tracking flags to MAX_FRAMES_IN_FLIGHT + resources.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + resources.lastUpdatedFrameBasic.assign(MAX_FRAMES_IN_FLIGHT, 0xFFFFFFFFFFFFFFFFULL); + resources.lastUpdatedFramePBR.assign(MAX_FRAMES_IN_FLIGHT, 0xFFFFFFFFFFFFFFFFULL); + + // Create instance buffer for all entities (shaders always expect instance data) + auto* meshComponent = entity->GetComponent(); + if (meshComponent) { + std::vector instanceData; + + if (meshComponent->GetInstanceCount() > 0) { + // Use existing instance data from GLTF loading (whether 1 or many instances) + instanceData = meshComponent->GetInstances(); + } else { + // Create single instance data using IDENTITY matrix to avoid double-transform with UBO.model + InstanceData singleInstance; + singleInstance.setModelMatrix(glm::mat4(1.0f)); + instanceData = {singleInstance}; + } + + vk::DeviceSize instanceBufferSize = sizeof(InstanceData) * instanceData.size(); + + auto [instanceBuffer, 
instanceBufferAllocation] = createBufferPooled( + instanceBufferSize, + vk::BufferUsageFlagBits::eVertexBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + if (!instanceBufferAllocation) { + throw std::runtime_error("Failed to allocate instance buffer from memory pool"); + } + // Copy instance data to buffer + void* instanceMappedMemory = instanceBufferAllocation->mappedPtr; + if (instanceMappedMemory) { + std::memcpy(instanceMappedMemory, instanceData.data(), instanceBufferSize); + } else { + std::cerr << "Warning: Instance buffer allocation is not mapped" << std::endl; + } + + resources.instanceBuffer = std::move(instanceBuffer); + resources.instanceBufferAllocation = std::move(instanceBufferAllocation); + resources.instanceBufferMapped = instanceMappedMemory; + } + + // Add to entity resources map with unique lock + { + std::unique_lock lock(entityResourcesMutex); + entityResources[entity] = std::move(resources); + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create uniform buffers: " << e.what() << std::endl; + return false; + } +} + +// Create descriptor pool +bool Renderer::createDescriptorPool() { + try { + // Calculate pool sizes for all Bistro materials plus additional entities + // The Bistro model creates many more entities than initially expected + // Each entity needs descriptor sets for both basic and PBR pipelines + // PBR pipeline needs 7 descriptors per set (1 UBO + 5 PBR textures + 1 shadow map array with 16 shadow maps) + // Basic pipeline needs 2 descriptors per set (1 UBO + 1 texture) + const uint32_t maxEntities = 20000; // Increased to 20k entities to handle large scenes like Bistro reliably + const uint32_t maxDescriptorSets = MAX_FRAMES_IN_FLIGHT * maxEntities * 2; // 2 pipeline types per entity + + // Calculate descriptor counts + // UBO descriptors: 1 per descriptor set + const uint32_t uboDescriptors = maxDescriptorSets; + // Texture descriptors: Basic 
pipeline uses 1, PBR uses 21 (5 PBR textures + 16 shadow maps) + // Allocate for worst case: all entities using PBR (21 texture descriptors each) + const uint32_t textureDescriptors = MAX_FRAMES_IN_FLIGHT * maxEntities * 21; + // Storage buffer descriptors: PBR pipeline uses multiple storage buffers per descriptor set. + // Storage buffers used per PBR descriptor set: + // - Binding 6: light storage buffer + // - Binding 7: Forward+ tile headers buffer + // - Binding 8: Forward+ tile indices buffer + // - Binding 9: Fragment debug output buffer (optional) + // - Binding 12: Ray-query geometry info buffer (for raster ray-query shadows) + // - Binding 13: Ray-query material buffer (for raster ray-query shadows) + const uint32_t storageBufferDescriptors = MAX_FRAMES_IN_FLIGHT * maxEntities * 6u; + + // Acceleration structure descriptors: Ray query needs 1 TLAS descriptor per frame + const uint32_t accelerationStructureDescriptors = MAX_FRAMES_IN_FLIGHT; + + // Storage image descriptors: Ray query needs 1 output image descriptor per frame + const uint32_t storageImageDescriptors = MAX_FRAMES_IN_FLIGHT; + + // Reserve extra combined image sampler capacity for Ray Query binding 6 (baseColor texture array) + const uint32_t rqTexDescriptors = MAX_FRAMES_IN_FLIGHT * RQ_MAX_TEX; + std::array poolSizes = { + vk::DescriptorPoolSize{ + .type = vk::DescriptorType::eUniformBuffer, + .descriptorCount = uboDescriptors + }, + vk::DescriptorPoolSize{ + .type = vk::DescriptorType::eCombinedImageSampler, + .descriptorCount = textureDescriptors + rqTexDescriptors + }, + vk::DescriptorPoolSize{ + .type = vk::DescriptorType::eStorageBuffer, + .descriptorCount = storageBufferDescriptors + }, + vk::DescriptorPoolSize{ + .type = vk::DescriptorType::eAccelerationStructureKHR, + .descriptorCount = accelerationStructureDescriptors + }, + vk::DescriptorPoolSize{ + .type = vk::DescriptorType::eStorageImage, + .descriptorCount = storageImageDescriptors + } + }; + + // Create descriptor pool + 
vk::DescriptorPoolCreateFlags poolFlags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet; + if (descriptorIndexingEnabled) { + poolFlags |= vk::DescriptorPoolCreateFlagBits::eUpdateAfterBind; + } + vk::DescriptorPoolCreateInfo poolInfo{ + .flags = poolFlags, + .maxSets = maxDescriptorSets, + .poolSizeCount = static_cast(poolSizes.size()), + .pPoolSizes = poolSizes.data() + }; + + descriptorPool = vk::raii::DescriptorPool(device, poolInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create descriptor pool: " << e.what() << std::endl; + return false; + } +} + +// Create descriptor sets +bool Renderer::createDescriptorSets(Entity* entity, const std::string& texturePath, bool usePBR) { + std::shared_lock lock(entityResourcesMutex); + auto entityIt = entityResources.find(entity); + if (entityIt == entityResources.end()) + return false; + return createDescriptorSets(entity, entityIt->second, texturePath, usePBR); +} + +bool Renderer::createDescriptorSets(Entity* entity, EntityResources& res, const std::string& texturePath, bool usePBR) { + // Kick watchdog periodically during heavy descriptor creation (if called from a long loop) + static uint32_t descWatchdogCounter = 0; + if (++descWatchdogCounter % 50 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + + // Resolve alias before taking the shared lock to avoid nested shared_lock on the same mutex + const std::string resolvedTexturePath = ResolveTextureId(texturePath); + try { + vk::DescriptorSetLayout selectedLayout = usePBR ? *pbrDescriptorSetLayout : *descriptorSetLayout; + std::vector layouts(MAX_FRAMES_IN_FLIGHT, selectedLayout); + vk::DescriptorSetAllocateInfo allocInfo{.descriptorPool = *descriptorPool, .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, .pSetLayouts = layouts.data()}; + + auto& targetDescriptorSets = usePBR ? 
res.pbrDescriptorSets : res.basicDescriptorSets; + if (targetDescriptorSets.empty()) { + std::lock_guard lk(descriptorMutex); + // Allocate into a temporary owning container, then move the individual RAII sets into our vector. + // (Avoid assigning `vk::raii::DescriptorSets` directly into `std::vector`.) + auto sets = vk::raii::DescriptorSets(device, allocInfo); + targetDescriptorSets.clear(); + targetDescriptorSets.reserve(sets.size()); + for (auto& s : sets) { + targetDescriptorSets.emplace_back(std::move(s)); + } + } + + // Checking validity prevents SIGSEGV crash when Vulkan tries to access invalid handles. + if (targetDescriptorSets.empty() || targetDescriptorSets.size() < MAX_FRAMES_IN_FLIGHT) { + std::cerr << "ERROR: Descriptor set allocation failed for entity " << entity->GetName() + << " (usePBR=" << usePBR << "). Descriptor pool may be exhausted." << std::endl; + return false; + } + + // Only initialize the current frame's descriptor set at runtime to avoid + // updating descriptor sets that may be in use by pending command buffers. + // Other frames will be initialized at their own safe points. + size_t startIndex = static_cast(currentFrame); + size_t endIndex = startIndex + 1; + for (size_t i = startIndex; i < endIndex; i++) { + // Optimization: skip if already updated for this frame slot since the entity was last marked dirty + auto& lastUpdatedVec = usePBR ? res.lastUpdatedFramePBR : res.lastUpdatedFrameBasic; + if (lastUpdatedVec[i] != 0xFFFFFFFFFFFFFFFFULL) { + continue; + } + + // Update the frame tracker BEFORE we might potentially return on error, + // though typically we'd only want to mark success. + // We mark it now so we don't spam errors every frame if it fails once. 
+ lastUpdatedVec[i] = totalFrameCount.load(std::memory_order_relaxed); + + // Validate descriptor set handle before dereferencing to prevent crash + // Check if the underlying VkDescriptorSet handle is valid (not null/default) + vk::DescriptorSet handleCheck = *targetDescriptorSets[i]; + if (handleCheck == vk::DescriptorSet{}) { + std::cerr << "ERROR: Invalid descriptor set handle for entity " << entity->GetName() + << " frame " << i << " (usePBR=" << usePBR << ")" << std::endl; + return false; + } + vk::DescriptorBufferInfo bufferInfo{.buffer = *res.uniformBuffers[i], .range = sizeof(UniformBufferObject)}; + + if (usePBR) { + // Build descriptor writes dynamically to avoid writing unused bindings + std::vector descriptorWrites; + std::array imageInfos; + // Keep additional descriptor infos alive until updateDescriptorSets completes. + vk::DescriptorImageInfo reflInfo; + vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{}; + vk::AccelerationStructureKHR tlasHandleValue{}; + vk::DescriptorBufferInfo lightBufferInfo; + vk::DescriptorBufferInfo headersInfo; + vk::DescriptorBufferInfo indicesInfo; + + descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}); + + auto meshComponent = entity->GetComponent(); + if (meshComponent) { + std::array pbrTexturePaths; + { + const std::string legacyPath = meshComponent->GetTexturePath(); + pbrTexturePaths[0] = (!meshComponent->GetBaseColorTexturePath().empty()) ? meshComponent->GetBaseColorTexturePath() : (!legacyPath.empty() ? legacyPath : SHARED_DEFAULT_ALBEDO_ID); + pbrTexturePaths[1] = (!meshComponent->GetMetallicRoughnessTexturePath().empty()) ? meshComponent->GetMetallicRoughnessTexturePath() : SHARED_DEFAULT_METALLIC_ROUGHNESS_ID; + pbrTexturePaths[2] = (!meshComponent->GetNormalTexturePath().empty()) ? 
meshComponent->GetNormalTexturePath() : SHARED_DEFAULT_NORMAL_ID; + pbrTexturePaths[3] = (!meshComponent->GetOcclusionTexturePath().empty()) ? meshComponent->GetOcclusionTexturePath() : SHARED_DEFAULT_OCCLUSION_ID; + pbrTexturePaths[4] = (!meshComponent->GetEmissiveTexturePath().empty()) ? meshComponent->GetEmissiveTexturePath() : SHARED_DEFAULT_EMISSIVE_ID; + } + + std::shared_lock texLock(textureResourcesMutex); + for (int j = 0; j < 5; j++) { + const auto resolvedBindingPath = ResolveTextureId(pbrTexturePaths[j]); + auto textureIt = textureResources.find(resolvedBindingPath); + TextureResources* texRes = (textureIt != textureResources.end()) ? &textureIt->second : &defaultTextureResources; + + imageInfos[j] = { + .sampler = *texRes->textureSampler, + .imageView = *texRes->textureImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + descriptorWrites.push_back({ + .dstSet = *targetDescriptorSets[i], + .dstBinding = static_cast(j + 1), + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .pImageInfo = &imageInfos[j] + }); + } + } + + lightBufferInfo = vk::DescriptorBufferInfo{.buffer = *lightStorageBuffers[i].buffer, .range = VK_WHOLE_SIZE}; + descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 6, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightBufferInfo}); + + // Ensure Forward+ per-frame array exists + if (forwardPlusPerFrame.empty()) { + forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT); + } + + // Ensure tile headers buffer exists (binding 7) - create minimal dummy if needed + if (i < forwardPlusPerFrame.size()) { + auto& f = forwardPlusPerFrame[i]; + if (!*f.tileHeaders) { + vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Single TileHeader {offset, count, pad0, pad1} + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | 
vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileHeaders = std::move(buf); + f.tileHeadersAlloc = std::move(alloc); + if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) { + std::memset(f.tileHeadersAlloc->mappedPtr, 0, minSize); + } + } + headersInfo = vk::DescriptorBufferInfo{.buffer = *f.tileHeaders, .offset = 0, .range = VK_WHOLE_SIZE}; + } + + // Ensure tile light indices buffer exists (binding 8) - create minimal dummy if needed + if (i < forwardPlusPerFrame.size()) { + auto& f = forwardPlusPerFrame[i]; + if (!*f.tileLightIndices) { + vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Minimal array of 4 uints + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileLightIndices = std::move(buf); + f.tileLightIndicesAlloc = std::move(alloc); + if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) { + std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, minSize); + } + } + indicesInfo = vk::DescriptorBufferInfo{.buffer = *f.tileLightIndices, .offset = 0, .range = VK_WHOLE_SIZE}; + } + + // Now both headersInfo and indicesInfo have valid buffers (never nullptr) + descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo}); + descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 8, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo}); + + // Binding 10: reflection sampler (planar reflections) + // Always bind a safe fallback (default texture) so the descriptor is valid. 
+ reflInfo = vk::DescriptorImageInfo{ + .sampler = *defaultTextureResources.textureSampler, + .imageView = *defaultTextureResources.textureImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + descriptorWrites.push_back({ + .dstSet = *targetDescriptorSets[i], + .dstBinding = 10, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .pImageInfo = &reflInfo + }); + + // Binding 11: TLAS (ray-query shadows in raster fragment shader) + // The PBR pipeline layout always declares this binding; it must be written before any draw. + // Bind the current TLAS when AS is enabled. + if (accelerationStructureEnabled) { + vk::AccelerationStructureKHR h = *tlasStructure.handle; + if (!!h) + tlasHandleValue = h; + } + tlasInfo.accelerationStructureCount = 1; + tlasInfo.pAccelerationStructures = &tlasHandleValue; + vk::WriteDescriptorSet tlasWrite{}; + tlasWrite.dstSet = *targetDescriptorSets[i]; + tlasWrite.dstBinding = 11; + tlasWrite.dstArrayElement = 0; + tlasWrite.descriptorCount = 1; + tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR; + tlasWrite.pNext = &tlasInfo; + descriptorWrites.push_back(tlasWrite); { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + } else { + // Basic Pipeline + // ... (this part remains the same) + vk::Sampler samplerHandle{}; + vk::ImageView viewHandle{}; { + std::shared_lock lock(textureResourcesMutex); + auto textureIt = textureResources.find(resolvedTexturePath); + TextureResources* texRes = (textureIt != textureResources.end()) ? 
&textureIt->second : &defaultTextureResources; + samplerHandle = *texRes->textureSampler; + viewHandle = *texRes->textureImageView; + } + vk::DescriptorImageInfo imageInfo{.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal}; + std::array descriptorWrites = { + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[i], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}, + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[i], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo} + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + } + } + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create descriptor sets for " << entity->GetName() << ": " << e.what() << std::endl; + return false; + } +} + +// Pre-allocate all Vulkan resources for an entity during scene loading +bool Renderer::preAllocateEntityResources(Entity* entity) { + try { + // Get the mesh component + auto meshComponent = entity->GetComponent(); + if (!meshComponent) { + std::cerr << "Entity " << entity->GetName() << " has no mesh component" << std::endl; + return false; + } + + // Ensure local AABB is available for debug/probes + meshComponent->RecomputeLocalAABB(); + + // 1. Create mesh resources (vertex/index buffers) + if (!createMeshResources(meshComponent)) { + std::cerr << "Failed to create mesh resources for entity: " << entity->GetName() << std::endl; + return false; + } + + // 2. 
Create uniform buffers + if (!createUniformBuffers(entity)) { + std::cerr << "Failed to create uniform buffers for entity: " << entity->GetName() << std::endl; + return false; + } + + // Initialize per-frame UBO and image binding write flags + { + std::unique_lock lock(entityResourcesMutex); + auto it = entityResources.find(entity); + if (it != entityResources.end()) { + it->second.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + it->second.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + it->second.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + it->second.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + it->second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + it->second.lastUpdatedFrameBasic.assign(MAX_FRAMES_IN_FLIGHT, 0xFFFFFFFFFFFFFFFFULL); + it->second.lastUpdatedFramePBR.assign(MAX_FRAMES_IN_FLIGHT, 0xFFFFFFFFFFFFFFFFULL); + } + } + + // 3. Pre-allocate BOTH basic and PBR descriptor sets + std::string texturePath = meshComponent->GetTexturePath(); + // Fallback: if legacy texturePath is empty, use PBR baseColor texture + if (texturePath.empty()) { + const std::string& baseColor = meshComponent->GetBaseColorTexturePath(); + if (!baseColor.empty()) { + texturePath = baseColor; + } + } + + // Create basic descriptor sets + if (!createDescriptorSets(entity, texturePath, false)) { + std::cerr << "Failed to create basic descriptor sets for entity: " << entity->GetName() << std::endl; + return false; + } + + // Create PBR descriptor sets + if (!createDescriptorSets(entity, texturePath, true)) { + std::cerr << "Failed to create PBR descriptor sets for entity: " << entity->GetName() << std::endl; + return false; + } + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to pre-allocate resources for entity " << entity->GetName() << ": " << e.what() << std::endl; + return false; + } +} + +// Pre-allocate Vulkan resources for a batch of entities, batching mesh uploads +bool 
Renderer::preAllocateEntityResourcesBatch(const std::vector& entities) { + watchdogProgressLabel.store("Batch: ensureThreadLocalVulkanInit", std::memory_order_relaxed); + watchdogProgressIndex.store(0, std::memory_order_relaxed); + ensureThreadLocalVulkanInit(); + try { + // --- 1. For all entities, create mesh resources with deferred uploads --- + // Then, during initial loading (and while an AS build is pending), flush the queued + // uploads immediately in a single batched submit (much faster than per-mesh submits). + watchdogProgressLabel.store("Batch: createMeshResources loop", std::memory_order_relaxed); + std::vector meshesNeedingUpload; + meshesNeedingUpload.reserve(entities.size()); + const bool flushUploadsNow = IsLoading() || asBuildRequested.load(std::memory_order_relaxed); + + uint32_t processedMeshes = 0; + uint32_t meshLoopIndex = 0; + bool anyFailure = false; + for (Entity* entity : entities) { + watchdogProgressIndex.store(meshLoopIndex++, std::memory_order_relaxed); + + if (!entity) { + continue; + } + + // Kick watchdog only occasionally during heavy mesh resource creation + if (++processedMeshes % 20 == 0) { + KickWatchdog(); + } + + auto meshComponent = entity->GetComponent(); + if (!meshComponent) { + continue; + } + + watchdogProgressLabel.store("Batch: createMeshResources", std::memory_order_relaxed); + if (!createMeshResources(meshComponent, /*deferUpload=*/true)) { + std::cerr << "Failed to create mesh resources for entity (batch): " + << entity->GetName() << std::endl; + anyFailure = true; + continue; + } + + { + std::shared_lock lock(meshResourcesMutex); + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) { + continue; + } + MeshResources& res = it->second; + + // Only schedule meshes that still have staged data pending upload + if (res.vertexBufferSizeBytes > 0 || res.indexBufferSizeBytes > 0) { + meshesNeedingUpload.push_back(meshComponent); + } + } + } + + // --- 2. 
Defer all GPU copies to the render thread safe point --- + if (!meshesNeedingUpload.empty()) + { + watchdogProgressLabel.store("Batch: EnqueueMeshUploads", std::memory_order_relaxed); + EnqueueMeshUploads(meshesNeedingUpload); + if (flushUploadsNow) { + // We used to call ProcessPendingMeshUploads() here, but it's redundant and slow + // because Render() calls it once per frame already. + } + } + + // --- 3. Create uniform buffers and descriptor sets per entity --- + watchdogProgressLabel.store("Batch: per-entity resources loop", std::memory_order_relaxed); + uint32_t processedResources = 0; + uint32_t resourceLoopIndex = 0; + for (Entity* entity : entities) { + watchdogProgressIndex.store(resourceLoopIndex++, std::memory_order_relaxed); + + if (!entity) { + continue; + } + + // Kick watchdog only occasionally during heavy resource creation + if (++processedResources % 20 == 0) { + KickWatchdog(); + } + + auto meshComponent = entity->GetComponent(); + if (!meshComponent) { + continue; + } + + watchdogProgressLabel.store("Batch: createUniformBuffers", std::memory_order_relaxed); + if (!createUniformBuffers(entity)) { + std::cerr << "Failed to create uniform buffers for entity (batch): " + << entity->GetName() << std::endl; + anyFailure = true; + continue; + } + + std::string texturePath = meshComponent->GetTexturePath(); + // Fallback: if legacy texturePath is empty, use PBR baseColor texture + if (texturePath.empty()) { + const std::string& baseColor = meshComponent->GetBaseColorTexturePath(); + if (!baseColor.empty()) { + texturePath = baseColor; + } + } + + // Optimization: Only create the descriptor sets that will actually be used. + // In the Bistro scene and most modern GLTF loads, we use PBR by default. + // We skip basic descriptor sets unless the entity is explicitly non-PBR. 
+ bool usePBR = true; + + if (usePBR) { + watchdogProgressLabel.store("Batch: createDescriptorSets (pbr)", std::memory_order_relaxed); + if (!createDescriptorSets(entity, texturePath, true)) { + std::cerr << "Failed to create PBR descriptor sets for entity (batch): " + << entity->GetName() << std::endl; + anyFailure = true; + continue; + } + } else { + watchdogProgressLabel.store("Batch: createDescriptorSets (basic)", std::memory_order_relaxed); + if (!createDescriptorSets(entity, texturePath, false)) { + std::cerr << "Failed to create basic descriptor sets for entity (batch): " + << entity->GetName() << std::endl; + anyFailure = true; + continue; + } + } + } + + return !anyFailure; + } catch (const std::exception& e) { + std::cerr << "Failed to batch pre-allocate resources for entities: " << e.what() << std::endl; + return false; + } +} + +// Enqueue a set of meshes to upload on the render thread (safe point) +void Renderer::EnqueueMeshUploads(const std::vector& meshes) { + if (meshes.empty()) + return; + std::lock_guard lk(pendingMeshUploadsMutex); + // Avoid duplicates by using a temporary set of current entries + for (MeshComponent* m : meshes) { + if (!m) + continue; + pendingMeshUploads.push_back(m); + } +} + +void Renderer::EnqueueEntityPreallocationBatch(const std::vector& entities) { + if (entities.empty()) + return; { + std::lock_guard lk(pendingEntityPreallocMutex); + for (Entity* e : entities) { + if (!e) + continue; + pendingEntityPrealloc.push_back(e); + } + } + pendingEntityPreallocQueued.store(true, std::memory_order_relaxed); +} + +void Renderer::EnqueueInstanceBufferRecreation(Entity* entity) { + if (!entity) + return; { + std::lock_guard lk(pendingEntityPreallocMutex); + pendingInstanceBufferRecreations.push_back(entity); + } + pendingEntityPreallocQueued.store(true, std::memory_order_relaxed); +} + +void Renderer::ProcessPendingEntityPreallocations() { + if (!pendingEntityPreallocQueued.load(std::memory_order_relaxed)) + return; + + // Suppress 
watchdog during heavy preallocation batches + ScopedWatchdogSuppression watchdogGuard(this); + watchdogProgressLabel.store("Prealloc: drain queues", std::memory_order_relaxed); + + std::vector toPreallocate; + std::vector toRecreateInstances; { + std::lock_guard lk(pendingEntityPreallocMutex); + if (pendingEntityPrealloc.empty() && pendingInstanceBufferRecreations.empty()) { + pendingEntityPreallocQueued.store(false, std::memory_order_relaxed); + return; + } + // Time-slice preallocations to avoid blocking the render thread for huge scenes. + // Process a larger chunk while loading to speed up startup, and a smaller chunk + // during normal gameplay to maintain high FPS. + const size_t preallocChunkSize = IsLoading() ? 100 : 1; + if (!pendingEntityPrealloc.empty()) { + size_t count = std::min(pendingEntityPrealloc.size(), preallocChunkSize); + toPreallocate.insert(toPreallocate.end(), pendingEntityPrealloc.begin(), pendingEntityPrealloc.begin() + count); + pendingEntityPrealloc.erase(pendingEntityPrealloc.begin(), pendingEntityPrealloc.begin() + count); + } + + // Always process all instance recreations as they are usually fewer and more critical + toRecreateInstances.swap(pendingInstanceBufferRecreations); + + if (pendingEntityPrealloc.empty() && pendingInstanceBufferRecreations.empty()) { + pendingEntityPreallocQueued.store(false, std::memory_order_relaxed); + } + } + + // De-dup preallocations + watchdogProgressLabel.store("Prealloc: dedup", std::memory_order_relaxed); + std::sort(toPreallocate.begin(), toPreallocate.end()); + toPreallocate.erase(std::unique(toPreallocate.begin(), toPreallocate.end()), toPreallocate.end()); + + std::vector batch; + batch.reserve(toPreallocate.size()); + for (Entity* e : toPreallocate) { + if (!e || !e->IsActive()) + continue; + if (!e->GetComponent()) + continue; + batch.push_back(e); + } + + if (!batch.empty()) { + static uint64_t batchCount = 0; + batchCount++; + auto start = std::chrono::steady_clock::now(); + 
watchdogProgressLabel.store("Prealloc: preAllocateEntityResourcesBatch", std::memory_order_relaxed); + if (!preAllocateEntityResourcesBatch(batch)) { + std::cerr << "Warning: batch entity GPU preallocation failed; some entities may be missing or retried later" << std::endl; + // Note: we don't put them back at the head of the queue to avoid infinite loops on invalid data, + // but they remain in the scene and might be retried if their components are refreshed. + // For now, most failures are fatal per-entity but shouldn't crash the renderer. + } + CompleteLoadingWorkItems(batch.size()); + } + + // Process instance buffer recreations. + // Wait for GPU idle ONCE before processing the batch to safely destroy old buffers. + if (!toRecreateInstances.empty()) { + watchdogProgressLabel.store("Prealloc: wait other inFlightFences (before recreateInstanceBuffer)", std::memory_order_relaxed); + // IMPORTANT: We are called from the render thread at the frame-start safe point, + // *after* `inFlightFences[currentFrame]` was waited and then reset. + // Waiting on the current frame fence here would deadlock forever because it won't be + // signaled until we submit the current frame (which can't happen while we're blocked). 
+ std::vector fencesToWait; + watchdogProgressLabel.store("Prealloc: recreateInstanceBuffer loop", std::memory_order_relaxed); + uint32_t processed = 0; + for (Entity* e : toRecreateInstances) { + if (!e || !e->IsActive()) + continue; + + // Kick watchdog periodically during heavy batch processing + if (++processed % 10 == 0) { + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + + if (!recreateInstanceBuffer(e)) { + std::cerr << "Warning: failed to recreate instance buffer for entity " << e->GetName() << std::endl; + } + } + } + + watchdogProgressLabel.store("Prealloc: done", std::memory_order_relaxed); +} + +// Execute pending mesh uploads on the render thread after the per-frame fence wait +void Renderer::ProcessPendingMeshUploads() { + try { + // 0. Retire completed async upload batches (if timeline semaphore is available) + if (!!*uploadsTimeline) { + bool anyCompleted = false; + while (true) { + InFlightMeshUploadBatch completedBatch; + uint64_t batchValue = 0; { + std::lock_guard lk(inFlightMeshUploadsMutex); + if (inFlightMeshUploads.empty()) + break; + if (inFlightMeshUploads.front().signalValue == 0) + break; + batchValue = inFlightMeshUploads.front().signalValue; + + // Use waitSemaphores with 0 timeout to poll for completion + vk::Result waitResult = vk::Result::eTimeout; + uint64_t currentCounter = 0; + try { + currentCounter = uploadsTimeline.getCounterValue(); + if (currentCounter >= batchValue) { + waitResult = vk::Result::eSuccess; + } else { + // Also try polling with 0 timeout just in case getCounterValue is stale + vk::SemaphoreWaitInfo waitInfo{ + .semaphoreCount = 1, + .pSemaphores = &*uploadsTimeline, + .pValues = &batchValue + }; + waitResult = device.waitSemaphores(waitInfo, 0); + } + } catch (...) 
{} + + if (waitResult != vk::Result::eSuccess) { + break; // not finished yet + } + + // Now safe to retire the batch + completedBatch = std::move(inFlightMeshUploads.front()); + inFlightMeshUploads.pop_front(); + + // Double check synchronization: ensure the semaphore counter is indeed at least batchValue + currentCounter = uploadsTimeline.getCounterValue(); + if (currentCounter < batchValue) { + std::cerr << "CRITICAL: waitSemaphores returned Success but getCounterValue is still " + << currentCounter << " (expected >= " << batchValue << ")" << std::endl; + // Push it back and break? No, if waitSemaphores lied, we are in trouble anyway. + } + } + + if (completedBatch.commandPool) { + // Lock meshResources exclusively to modify staging buffer states + std::unique_lock meshResourcesLock(meshResourcesMutex); + // Clear staging once copies are complete + for (auto* meshComponent : completedBatch.meshes) { + if (!meshComponent) continue; + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) + continue; + MeshResources& res = it->second; + // Accessing members of res (offset 8 for vertexBufferAllocation unique_ptr) + res.stagingVertexBuffer = vk::raii::Buffer(nullptr); + res.stagingVertexBufferAllocation = nullptr; + res.vertexBufferSizeBytes = 0; + res.stagingIndexBuffer = vk::raii::Buffer(nullptr); + res.stagingIndexBufferAllocation = nullptr; + res.indexBufferSizeBytes = 0; + } + anyCompleted = true; + + // Explicitly destroy command buffers and pool while we are here, + // though RAII will do it when completedBatch goes out of scope. + // This allows us to catch any exceptions during destruction. 
+ try { + completedBatch.commandBuffers = nullptr; + completedBatch.commandPool = nullptr; + } catch (const std::exception& ex) { + std::cerr << "Exception during batch retirement: " << ex.what() << std::endl; + } + } + } + + if (anyCompleted && !IsLoading()) { + asDevOverrideAllowRebuild = true; + RequestAccelerationStructureBuild("uploads completed"); + } + } + + // Grab the list atomically + std::vector toProcess; { + std::lock_guard lk(pendingMeshUploadsMutex); + if (pendingMeshUploads.empty()) + return; + toProcess.swap(pendingMeshUploads); + } + + // Build a quick lookup of meshes already in flight so we don't submit duplicate copies + std::unordered_set inFlightMeshes; { + std::lock_guard lk(inFlightMeshUploadsMutex); + for (const auto& b : inFlightMeshUploads) { + for (auto* m : b.meshes) { + inFlightMeshes.insert(m); + } + } + } + + // Filter to meshes that still have staged data + std::vector needsCopy; + needsCopy.reserve(toProcess.size()); + for (auto* meshComponent : toProcess) { + if (inFlightMeshes.find(meshComponent) != inFlightMeshes.end()) + continue; + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) + continue; + const MeshResources& res = it->second; + if (res.vertexBufferSizeBytes > 0 || res.indexBufferSizeBytes > 0) { + needsCopy.push_back(meshComponent); + } + } + + if (needsCopy.empty()) + return; + + // Record copies on GRAPHICS queue + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value() + }; + + // Prefer async submission via the uploads timeline semaphore. + // ONLY force synchronous if AS build is ALREADY requested AND we have no timeline. + // Doing synchronous uploads during loading hangs the main thread for large scenes. 
+ const bool forceSynchronous = asBuildRequested.load(std::memory_order_relaxed) && + (!*uploadsTimeline || *uploadsTimeline == vk::Semaphore{}); + const bool canSignalTimeline = (!!*uploadsTimeline && *uploadsTimeline != vk::Semaphore{}) && !forceSynchronous; + + if (canSignalTimeline) { + auto tempPool = std::make_unique(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = **tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + auto cbs = std::make_unique(device, allocInfo); + vk::raii::CommandBuffer& cb = (*cbs)[0]; + cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + uint32_t processed = 0; + for (auto* meshComponent : needsCopy) { + if (++processed % 5 == 0) { + KickWatchdog(); + } + MeshResources* pRes = nullptr; { + std::shared_lock meshLock(meshResourcesMutex); + auto it = meshResources.find(meshComponent); + if (it != meshResources.end()) pRes = &it->second; + } + if (!pRes) continue; + MeshResources& res = *pRes; + if (res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) { + vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.vertexBufferSizeBytes}; + cb.copyBuffer(*res.stagingVertexBuffer, *res.vertexBuffer, region); + } + if (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer) { + vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.indexBufferSizeBytes}; + cb.copyBuffer(*res.stagingIndexBuffer, *res.indexBuffer, region); + } + } + cb.end(); + + uint64_t signalVal = 0; + SubmitToQueue2(*graphicsQueue, *cb, true, &signalVal, nullptr); + + InFlightMeshUploadBatch batch; + batch.signalValue = signalVal; + batch.meshes = std::move(needsCopy); + batch.commandPool = std::move(tempPool); + batch.commandBuffers = std::move(cbs); { + std::lock_guard lk(inFlightMeshUploadsMutex); + inFlightMeshUploads.push_back(std::move(batch)); + } + } else { + // Fallback: submit and wait (limit batch size to 
avoid long frame hangs) + const size_t batchLimit = 50; + for (size_t i = 0; i < needsCopy.size(); i += batchLimit) { + size_t end = std::min(i + batchLimit, needsCopy.size()); + std::vector batch(needsCopy.begin() + i, needsCopy.begin() + end); + + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + for (auto* meshComponent : batch) { + MeshResources* pRes = nullptr; { + std::shared_lock meshLock(meshResourcesMutex); + auto it = meshResources.find(meshComponent); + if (it != meshResources.end()) pRes = &it->second; + } + if (!pRes) continue; + MeshResources& res = *pRes; + if (res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) { + vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.vertexBufferSizeBytes}; + cb.copyBuffer(*res.stagingVertexBuffer, *res.vertexBuffer, region); + } + if (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer) { + vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.indexBufferSizeBytes}; + cb.copyBuffer(*res.stagingIndexBuffer, *res.indexBuffer, region); + } + } + cb.end(); + + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + SubmitToQueue2(*graphicsQueue, *cb, false, nullptr, *fence); + (void) waitForFencesSafe(*fence, VK_TRUE); + + for (auto* meshComponent : batch) { + auto it = meshResources.find(meshComponent); + if (it == meshResources.end()) continue; + MeshResources& res = it->second; + res.stagingVertexBuffer = vk::raii::Buffer(nullptr); + res.stagingVertexBufferAllocation = nullptr; + res.vertexBufferSizeBytes = 0; + res.stagingIndexBuffer = vk::raii::Buffer(nullptr); + res.stagingIndexBufferAllocation = nullptr; + 
res.indexBufferSizeBytes = 0; + } + // Update watchdog periodically + lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); + } + // Rebuild AS after uploads only if we are in Play state to avoid loading stutter. + // Initial AS build is triggered separately when preallocation finishes. + if (!IsLoading()) { + asDevOverrideAllowRebuild = true; + RequestAccelerationStructureBuild("uploads completed"); + } + } + } catch (const std::exception& e) { + std::cerr << "CRITICAL: ProcessPendingMeshUploads error: " << e.what() << std::endl; + } +} + +// Recreate instance buffer for an entity (e.g., after clearing instances for animation) +bool Renderer::recreateInstanceBuffer(Entity* entity) { + ensureThreadLocalVulkanInit(); + try { + // Find the entity in entityResources + auto it = entityResources.find(entity); + if (it == entityResources.end()) { + std::cerr << "Entity " << entity->GetName() << " not found in entityResources" << std::endl; + return false; + } + + EntityResources& resources = it->second; + + // Create a single instance with identity matrix + InstanceData singleInstance; + singleInstance.setModelMatrix(glm::mat4(1.0f)); + std::vector instanceData = {singleInstance}; + + vk::DeviceSize instanceBufferSize = sizeof(InstanceData) * instanceData.size(); + + // Create new instance buffer using memory pool + auto [instanceBuffer, instanceBufferAllocation] = createBufferPooled( + instanceBufferSize, + vk::BufferUsageFlagBits::eVertexBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + if (!instanceBufferAllocation) { + throw std::runtime_error("Failed to allocate instance buffer from memory pool"); + } + // Copy instance data to buffer + void* instanceMappedMemory = instanceBufferAllocation->mappedPtr; + if (instanceMappedMemory) { + std::memcpy(instanceMappedMemory, instanceData.data(), instanceBufferSize); + } else { + std::cerr << "Warning: Instance buffer allocation is not 
mapped" << std::endl; + } + + // Replace the old instance buffer with the new one. + // Note: Caller must ensure GPU is idle before this method is called to safely destroy the old buffer. + resources.instanceBuffer = std::move(instanceBuffer); + resources.instanceBufferAllocation = std::move(instanceBufferAllocation); + resources.instanceBufferMapped = instanceMappedMemory; + + std::cout << "[Animation] Recreated instance buffer for entity '" << entity->GetName() + << "' with single identity instance" << std::endl; + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to recreate instance buffer for entity " << entity->GetName() + << ": " << e.what() << std::endl; + return false; + } +} + +// Create buffer using memory pool for efficient allocation +std::pair> Renderer::createBufferPooled( + vk::DeviceSize size, + vk::BufferUsageFlags usage, + vk::MemoryPropertyFlags properties) { + try { + if (!memoryPool) { + throw std::runtime_error("Memory pool not initialized"); + } + + // Use memory pool for allocation + auto [buffer, allocation] = memoryPool->createBuffer(size, usage, properties); + + return {std::move(buffer), std::move(allocation)}; + } catch (const std::exception& e) { + std::cerr << "Failed to create buffer with memory pool: " << e.what() << std::endl; + throw; + } +} + +// Legacy createBuffer function - now strictly enforces memory pool usage +std::pair Renderer::createBuffer( + vk::DeviceSize size, + vk::BufferUsageFlags usage, + vk::MemoryPropertyFlags properties) { + // This function should only be used for temporary staging buffers during resource creation + // All persistent resources should use createBufferPooled directly + + if (!memoryPool) { + throw std::runtime_error("Memory pool not available - cannot create buffer"); + } + + // Only allow direct allocation for staging buffers (temporary, host-visible) + if (!(properties & vk::MemoryPropertyFlagBits::eHostVisible)) { + std::cerr << "ERROR: Legacy createBuffer should 
only be used for staging buffers!" << std::endl; + throw std::runtime_error("Legacy createBuffer used for non-staging buffer"); + } + + try { + vk::BufferCreateInfo bufferInfo{ + .size = size, + .usage = usage, + .sharingMode = vk::SharingMode::eExclusive + }; + + vk::raii::Buffer buffer(device, bufferInfo); + + // Allocate memory directly for staging buffers only + vk::MemoryRequirements memRequirements = buffer.getMemoryRequirements(); + + // Align allocation size to nonCoherentAtomSize (64 bytes) to prevent validation errors + // VUID-VkMappedMemoryRange-size-01390 requires memory flush sizes to be multiples of nonCoherentAtomSize + const vk::DeviceSize nonCoherentAtomSize = 64; // Typical value, should query from device properties + vk::DeviceSize alignedSize = ((memRequirements.size + nonCoherentAtomSize - 1) / nonCoherentAtomSize) * nonCoherentAtomSize; + + vk::MemoryAllocateInfo allocInfo{ + .allocationSize = alignedSize, + .memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties) + }; + + // Add allocation flags if requested (e.g. 
for buffers needing device address) + vk::MemoryAllocateFlagsInfo flagsInfo{}; + if (usage & vk::BufferUsageFlagBits::eShaderDeviceAddress) { + flagsInfo.flags = vk::MemoryAllocateFlagBits::eDeviceAddress; + allocInfo.pNext = &flagsInfo; + } + + vk::raii::DeviceMemory bufferMemory(device, allocInfo); + + // Bind memory to buffer + buffer.bindMemory(*bufferMemory, 0); + + return {std::move(buffer), std::move(bufferMemory)}; + } catch (const std::exception& e) { + std::cerr << "Failed to create staging buffer: " << e.what() << std::endl; + throw; + } +} + +void Renderer::createTransparentDescriptorSets() { + // We need one descriptor set per frame in flight for this resource + std::vector layouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout); + vk::DescriptorSetAllocateInfo allocInfo{ + .descriptorPool = *descriptorPool, + .descriptorSetCount = static_cast(MAX_FRAMES_IN_FLIGHT), + .pSetLayouts = layouts.data() + }; { + // Serialize allocation vs other descriptor ops + std::lock_guard lk(descriptorMutex); + transparentDescriptorSets = vk::raii::DescriptorSets(device, allocInfo); + } + + // Update each descriptor set to point to the per-frame off-screen opaque color image + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + vk::DescriptorImageInfo imageInfo{ + .sampler = *opaqueSceneColorSampler, + .imageView = *opaqueSceneColorImageViews[i], + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + + vk::WriteDescriptorSet descriptorWrite{ + .dstSet = *transparentDescriptorSets[i], + .dstBinding = 0, // Binding 0 in Set 1 + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .pImageInfo = &imageInfo + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrite, nullptr); + } + } +} + +void Renderer::createTransparentFallbackDescriptorSets() { + // Allocate one descriptor set per frame in flight using the same layout (single combined image sampler at binding 0) + std::vector 
layouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout); + vk::DescriptorSetAllocateInfo allocInfo{ + .descriptorPool = *descriptorPool, + .descriptorSetCount = static_cast(MAX_FRAMES_IN_FLIGHT), + .pSetLayouts = layouts.data() + }; { + std::lock_guard lk(descriptorMutex); + transparentFallbackDescriptorSets = vk::raii::DescriptorSets(device, allocInfo); + } + + // Point each set to the default texture, which is guaranteed to be in SHADER_READ_ONLY_OPTIMAL when used in the opaque pass + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) { + vk::DescriptorImageInfo imageInfo{ + .sampler = *defaultTextureResources.textureSampler, + .imageView = *defaultTextureResources.textureImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + + vk::WriteDescriptorSet descriptorWrite{ + .dstSet = *transparentFallbackDescriptorSets[i], + .dstBinding = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eCombinedImageSampler, + .pImageInfo = &imageInfo + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrite, nullptr); + } + } +} + +bool Renderer::createOpaqueSceneColorResources() { + try { + opaqueSceneColorImages.clear(); + opaqueSceneColorImageAllocations.clear(); + opaqueSceneColorImageViews.clear(); + opaqueSceneColorImageLayouts.clear(); + + opaqueSceneColorImages.reserve(MAX_FRAMES_IN_FLIGHT); + opaqueSceneColorImageAllocations.reserve(MAX_FRAMES_IN_FLIGHT); + opaqueSceneColorImageViews.reserve(MAX_FRAMES_IN_FLIGHT); + opaqueSceneColorImageLayouts.reserve(MAX_FRAMES_IN_FLIGHT); + + for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) { + auto [image, allocation] = createImagePooled( + swapChainExtent.width, + swapChainExtent.height, + swapChainImageFormat, + // Use the same format as the swapchain + vk::ImageTiling::eOptimal, + vk::ImageUsageFlagBits::eColorAttachment | vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eDeviceLocal); + + 
opaqueSceneColorImages.push_back(std::move(image)); + opaqueSceneColorImageAllocations.push_back(std::move(allocation)); + opaqueSceneColorImageViews.push_back(createImageView(opaqueSceneColorImages.back(), swapChainImageFormat, vk::ImageAspectFlagBits::eColor)); + opaqueSceneColorImageLayouts.push_back(vk::ImageLayout::eUndefined); + } + + // Create (or recreate) the sampler (shared across frames) + vk::SamplerCreateInfo samplerInfo{ + .magFilter = vk::Filter::eLinear, + .minFilter = vk::Filter::eLinear, + .addressModeU = vk::SamplerAddressMode::eClampToEdge, + .addressModeV = vk::SamplerAddressMode::eClampToEdge, + .addressModeW = vk::SamplerAddressMode::eClampToEdge, + }; + opaqueSceneColorSampler = vk::raii::Sampler(device, samplerInfo); + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create opaque scene color resources: " << e.what() << std::endl; + return false; + } +} + +// Copy `size` bytes from offset 0 of srcBuffer to offset 0 of dstBuffer. +// Records the copy into a one-time command buffer allocated from a temporary transfer-family pool, +// submits it to the transfer queue with a fence and calls waitForFencesSafe on that fence before returning. +// A zero `size` is a no-op. Any exception is logged to stderr and rethrown. +void Renderer::copyBuffer(vk::raii::Buffer& srcBuffer, vk::raii::Buffer& dstBuffer, vk::DeviceSize size) { + ensureThreadLocalVulkanInit(); + if (size == 0) { + // vkCmdCopyBuffer requires every region to have a non-zero size; there is nothing to copy + return; + } + try { + // Create a temporary transient command pool and command buffer to isolate per-thread usage (transfer family) + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.transferFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + + vk::raii::CommandBuffers commandBuffers(device, allocInfo); + vk::raii::CommandBuffer& commandBuffer = commandBuffers[0]; + + // Begin command buffer + vk::CommandBufferBeginInfo beginInfo{ + .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit + }; + + commandBuffer.begin(beginInfo); + + // Copy buffer + vk::BufferCopy copyRegion{ + .srcOffset = 0, + .dstOffset = 0, + .size =
size + }; + + commandBuffer.copyBuffer(*srcBuffer, *dstBuffer, copyRegion); + + // End command buffer + commandBuffer.end(); + + // Submit to the transfer queue with a fresh fence, then wait on that fence + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + SubmitToQueue2(*transferQueue, *commandBuffer, false, nullptr, *fence); + (void) waitForFencesSafe(*fence, VK_TRUE); + } catch (const std::exception& e) { + std::cerr << "Failed to copy buffer: " << e.what() << std::endl; + throw; + } +} + +// Create image +std::pair Renderer::createImage( + uint32_t width, + uint32_t height, + vk::Format format, + vk::ImageTiling tiling, + vk::ImageUsageFlags usage, + vk::MemoryPropertyFlags properties) { + try { + // Create image + vk::ImageCreateInfo imageInfo{ + .imageType = vk::ImageType::e2D, + .format = format, + .extent = {width, height, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .samples = vk::SampleCountFlagBits::e1, + .tiling = tiling, + .usage = usage, + .sharingMode = vk::SharingMode::eExclusive, + .initialLayout = vk::ImageLayout::eUndefined + }; + + vk::raii::Image image(device, imageInfo); + + // Allocate memory + vk::MemoryRequirements memRequirements = image.getMemoryRequirements(); + vk::MemoryAllocateInfo allocInfo{ + .allocationSize = memRequirements.size, + .memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties) + }; + + vk::raii::DeviceMemory imageMemory(device, allocInfo); + + // Bind memory to image + image.bindMemory(*imageMemory, 0); + + return {std::move(image), std::move(imageMemory)}; + } catch (const std::exception& e) { + std::cerr << "Failed to create image: " << e.what() << std::endl; + throw; + } +} + +// Create image using memory pool for efficient allocation +std::pair> Renderer::createImagePooled( + uint32_t width, + uint32_t height, + vk::Format format, + vk::ImageTiling tiling, + vk::ImageUsageFlags
usage, + vk::MemoryPropertyFlags properties, + uint32_t mipLevels, + vk::SharingMode sharingMode, + const std::vector& queueFamilies) { + try { + if (!memoryPool) { + throw std::runtime_error("Memory pool not initialized"); + } + + // Use memory pool for allocation (mipmap support limited by memory pool API) + auto [image, allocation] = memoryPool->createImage(width, + height, + format, + tiling, + usage, + properties, + mipLevels, + sharingMode, + queueFamilies); + + return {std::move(image), std::move(allocation)}; + } catch (const std::exception& e) { + std::cerr << "Failed to create image with memory pool: " << e.what() << std::endl; + throw; + } +} + +// Create an image view +vk::raii::ImageView Renderer::createImageView(vk::raii::Image& image, vk::Format format, vk::ImageAspectFlags aspectFlags, uint32_t mipLevels) { + try { + ensureThreadLocalVulkanInit(); + // Create image view + vk::ImageViewCreateInfo viewInfo{ + .image = *image, + .viewType = vk::ImageViewType::e2D, + .format = format, + .subresourceRange = { + .aspectMask = aspectFlags, + .baseMipLevel = 0, + .levelCount = mipLevels, + .baseArrayLayer = 0, + .layerCount = 1 + } + }; + + return {device, viewInfo}; + } catch (const std::exception& e) { + std::cerr << "Failed to create image view: " << e.what() << std::endl; + throw; + } +} + +// Transition the first `mipLevels` mip levels (array layer 0) of `image` from oldLayout to newLayout. +// Records one Sync2 image barrier into a one-time command buffer allocated from a temporary graphics-family pool, +// submits it to the graphics queue with a fence and calls waitForFencesSafe on that fence before returning. +// The barrier's aspect mask is derived from `format`. Layout pairs not handled below throw std::invalid_argument; +// every exception is logged to stderr and rethrown. +void Renderer::transitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels) { + ensureThreadLocalVulkanInit(); + try { + // Create a temporary transient command pool and command buffer to isolate per-thread usage + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level =
vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + + vk::raii::CommandBuffers commandBuffers(device, allocInfo); + vk::raii::CommandBuffer& commandBuffer = commandBuffers[0]; + + // Begin command buffer + vk::CommandBufferBeginInfo beginInfo{ + .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit + }; + + commandBuffer.begin(beginInfo); + + // Pick the image aspect(s) from the format. Combined depth/stencil formats must name BOTH aspects + // in a layout transition unless separateDepthStencilLayouts is enabled, so the stencil aspect is included. + vk::ImageAspectFlags aspectMask = vk::ImageAspectFlagBits::eColor; + if (format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint || format == vk::Format::eD16UnormS8Uint) { + aspectMask = vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil; + } else if (format == vk::Format::eD32Sfloat || format == vk::Format::eD16Unorm || format == vk::Format::eX8D24UnormPack32) { + aspectMask = vk::ImageAspectFlagBits::eDepth; + } + + // Create an image barrier (Sync2) + vk::ImageMemoryBarrier2 barrier2{ + .oldLayout = oldLayout, + .newLayout = newLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { + .aspectMask = aspectMask, + .baseMipLevel = 0, + .levelCount = mipLevels, + .baseArrayLayer = 0, + .layerCount = 1 + } + }; + + // Set stage and access masks based on layouts + if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eTransferDstOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eTransfer; + barrier2.dstAccessMask = vk::AccessFlagBits2::eTransferWrite; + } else if (oldLayout == vk::ImageLayout::eTransferDstOptimal && newLayout == vk::ImageLayout::eShaderReadOnlyOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTransfer; + barrier2.srcAccessMask = vk::AccessFlagBits2::eTransferWrite; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader; + barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderRead; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eDepthStencilAttachmentOptimal) { + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask =
vk::PipelineStageFlagBits2::eEarlyFragmentTests; + barrier2.dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead | vk::AccessFlagBits2::eDepthStencilAttachmentWrite; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eDepthStencilReadOnlyOptimal) { + // Support for shadow map creation: transition from undefined to read-only depth layout + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests; + barrier2.dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eGeneral) { + // Support for compute shader storage images: transition from undefined to general layout + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader; + barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderWrite | vk::AccessFlagBits2::eShaderRead; + } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eShaderReadOnlyOptimal) { + // Support for textures that skip staging buffer (e.g., preloaded, generated, or default textures) + // Transition directly from undefined to shader read-only for sampling + barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe; + barrier2.srcAccessMask = vk::AccessFlagBits2::eNone; + barrier2.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader; + barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderRead; + } else { + throw std::invalid_argument("Unsupported layout transition!"); + } + + // Add a barrier to command buffer (Sync2) + vk::DependencyInfo depInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .imageMemoryBarrierCount = 1, + .pImageMemoryBarriers = &barrier2 + }; +
commandBuffer.pipelineBarrier2(depInfo); + std::cout << "[transitionImageLayout] recorded barrier image=" << (void *) image << " old=" << static_cast(oldLayout) << " new=" << static_cast(newLayout) << std::endl; + + // End command buffer + commandBuffer.end(); + + // Submit with a fence only; no timeline semaphore is signalled + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + vk::CommandBufferSubmitInfo cmdInfo{.commandBuffer = *commandBuffer}; + vk::SubmitInfo2 submit2{ + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdInfo, + .signalSemaphoreInfoCount = 0, // Synchronous wait below; no timeline signal needed + .pSignalSemaphoreInfos = nullptr + }; + Submit2(*graphicsQueue, submit2, *fence); + (void) waitForFencesSafe(*fence, VK_TRUE); + } catch (const std::exception& e) { + std::cerr << "Failed to transition image layout: " << e.what() << std::endl; + throw; + } +} + +// Copy buffer to image +void Renderer::copyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height, vk::ArrayProxy regions) { + ensureThreadLocalVulkanInit(); + try { + // Create a temporary transient command pool for the GRAPHICS queue to avoid cross-queue races + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value() + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + + vk::raii::CommandBuffers commandBuffers(device, allocInfo); + vk::raii::CommandBuffer& commandBuffer = commandBuffers[0]; + + // Begin command buffer + vk::CommandBufferBeginInfo beginInfo{ + .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit + }; + +
commandBuffer.begin(beginInfo); + + // Copy buffer to image using provided regions + commandBuffer.copyBufferToImage( + buffer, + image, + vk::ImageLayout::eTransferDstOptimal, + regions); + std::cout << "[copyBufferToImage] recorded copy img=" << (void *) image << std::endl; + + // End command buffer + commandBuffer.end(); + + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + vk::CommandBufferSubmitInfo cmdInfo{.commandBuffer = *commandBuffer}; + vk::SemaphoreSubmitInfo signalInfo{ + .semaphore = *uploadsTimeline, + .value = 1, // Monotonic value will be set by Submit2 + .stageMask = vk::PipelineStageFlagBits2::eAllCommands + }; + vk::SubmitInfo2 submit2{ + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdInfo, + .signalSemaphoreInfoCount = 0, // Synchronous wait below; no timeline signal needed + .pSignalSemaphoreInfos = nullptr + }; + Submit2(*graphicsQueue, submit2, *fence); + (void) waitForFencesSafe(*fence, VK_TRUE); + } catch (const std::exception& e) { + std::cerr << "Failed to copy buffer to image: " << e.what() << std::endl; + throw; + } +} + +// Create or resize light storage buffers to accommodate the given number of lights +bool Renderer::createOrResizeLightStorageBuffers(size_t lightCount) { + try { + // Ensure we have storage buffers for each frame in flight + if (lightStorageBuffers.size() != MAX_FRAMES_IN_FLIGHT) { + lightStorageBuffers.resize(MAX_FRAMES_IN_FLIGHT); + } + + // Check if we need to resize buffers + bool needsResize = false; + for (auto& buffer : lightStorageBuffers) { + if (buffer.capacity < lightCount) { + needsResize = true; + break; + } + } + + if (!needsResize) { + return true; // Buffers are already large enough + } + + // Calculate new capacity (with some headroom for growth) + size_t newCapacity = std::max(lightCount * 2, static_cast(64)); + vk::DeviceSize bufferSize = sizeof(LightData) * newCapacity; + + // Wait for device to be idle before destroying old buffers to prevent validation errors. 
+ // External synchronization required (VVL): serialize against queue submits/present. + WaitIdle(); + + // Create new buffers for each frame + for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; ++i) { + auto& buffer = lightStorageBuffers[i]; + + // Clean up old buffer if it exists (now safe after waitIdle) + if (!!buffer.allocation) { + buffer.buffer = vk::raii::Buffer(nullptr); + buffer.allocation.reset(); + buffer.mapped = nullptr; + } + + // Create new storage buffer + auto [newBuffer, newAllocation] = createBufferPooled( + bufferSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + + // Get the mapped pointer from the allocation + void* mapped = newAllocation->mappedPtr; + + // Store the new buffer + buffer.buffer = std::move(newBuffer); + buffer.allocation = std::move(newAllocation); + buffer.mapped = mapped; + buffer.capacity = newCapacity; + buffer.size = 0; + } + + // Update all existing descriptor sets to reference the new light storage buffers + updateAllDescriptorSetsWithNewLightBuffers(); + + // Also refresh Forward+ compute descriptor sets (binding 0) so compute reads valid buffers + try { + if (!forwardPlusPerFrame.empty()) { + for (size_t i = 0; i < forwardPlusPerFrame.size() && i < lightStorageBuffers.size(); ++i) { + if (!*forwardPlusPerFrame[i].computeSet) + continue; + if (!*lightStorageBuffers[i].buffer) + continue; + vk::DescriptorBufferInfo lightsInfo{.buffer = *lightStorageBuffers[i].buffer, .offset = 0, .range = VK_WHOLE_SIZE}; + vk::WriteDescriptorSet write{ + .dstSet = *forwardPlusPerFrame[i].computeSet, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &lightsInfo + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(write, {}); + } + } + } + } catch (const std::exception& e) { + std::cerr << "Failed to update Forward+ compute descriptors after 
light buffer resize: " << e.what() << std::endl; + } + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to create or resize light storage buffers: " << e.what() << std::endl; + return false; + } +} + +// Update all existing descriptor sets with new light storage buffer references. +// Rewrites binding 6 (storage buffer) of each entity's per-frame PBR descriptor set to point at lightStorageBuffers[i]. +// allFrames == true updates every per-frame set; otherwise only the set for currentFrame is updated. +// Does nothing when descriptorSetsValid is false or while isRecordingCmd is set. Exceptions are logged, not rethrown. +void Renderer::updateAllDescriptorSetsWithNewLightBuffers(bool allFrames) { + try { + // Skip entirely if descriptor sets are flagged invalid + if (!descriptorSetsValid.load(std::memory_order_relaxed)) + return; + // Skip while the isRecordingCmd flag is set + if (isRecordingCmd.load(std::memory_order_relaxed)) + return; + // Iterate through all entity resources and update their PBR descriptor sets + for (auto& kv : entityResources) { + auto& resources = kv.second; + // Only update PBR descriptor sets (they have light buffer bindings) + if (!resources.pbrDescriptorSets.empty()) { + size_t beginFrame = allFrames ? 0 : static_cast(currentFrame); + size_t endFrame = allFrames ? resources.pbrDescriptorSets.size() : (beginFrame + 1); + // The loop bound also clamps i to both the descriptor-set count and the light-buffer count + for (size_t i = beginFrame; i < endFrame && i < resources.pbrDescriptorSets.size() && i < lightStorageBuffers.size(); ++i) { + // Skip if this set looks invalid/uninitialized + if (!(*resources.pbrDescriptorSets[i])) + continue; + if (i < lightStorageBuffers.size() && !!*lightStorageBuffers[i].buffer) { + // Create descriptor write for light storage buffer (binding 6) + vk::DescriptorBufferInfo lightBufferInfo{ + .buffer = *lightStorageBuffers[i].buffer, + .offset = 0, + .range = VK_WHOLE_SIZE + }; + + vk::WriteDescriptorSet descriptorWrite{ + .dstSet = *resources.pbrDescriptorSets[i], + .dstBinding = 6, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &lightBufferInfo + }; + + // Update the descriptor set (each write is issued under descriptorMutex) + { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrite, {}); + } + } + } + } + } + } catch (const std::exception& e) { + std::cerr << "Failed to update descriptor sets with new light buffers: " << e.what() << std::endl; + } +} + +//
Refresh only current frame's PBR descriptor bindings used by Forward+ +// Safe to call after waiting on inFlightFences[currentFrame] and before command recording. +void Renderer::refreshPBRForwardPlusBindingsForFrame(uint32_t frameIndex) { + try { + if (frameIndex >= MAX_FRAMES_IN_FLIGHT) + return; + if (!descriptorSetsValid.load(std::memory_order_relaxed)) + return; + if (isRecordingCmd.load(std::memory_order_relaxed)) + return; + + // Resolve current frame Forward+ buffers + vk::Buffer headersBuf{}; + vk::Buffer indicesBuf{}; + if (frameIndex < forwardPlusPerFrame.size()) { + auto& f = forwardPlusPerFrame[frameIndex]; + if (!!*f.tileHeaders) + headersBuf = *f.tileHeaders; + if (!!*f.tileLightIndices) + indicesBuf = *f.tileLightIndices; + } + + // Resolve current frame lights buffer + vk::Buffer lightsBuf{}; + if (frameIndex < lightStorageBuffers.size() && !!*lightStorageBuffers[frameIndex].buffer) { + lightsBuf = *lightStorageBuffers[frameIndex].buffer; + } + + // Ensure lights buffer exists (binding 6) - create minimal dummy if needed + if (!lightsBuf) { + // Lazily create a minimal lights buffer (single LightData element) for use when Forward+ is disabled + if (lightStorageBuffers.empty()) { + lightStorageBuffers.resize(MAX_FRAMES_IN_FLIGHT); + } + if (frameIndex < lightStorageBuffers.size() && !*lightStorageBuffers[frameIndex].buffer) { + vk::DeviceSize minSize = sizeof(LightData); // Single light element + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + lightStorageBuffers[frameIndex].buffer = std::move(buf); + lightStorageBuffers[frameIndex].allocation = std::move(alloc); + lightStorageBuffers[frameIndex].mapped = lightStorageBuffers[frameIndex].allocation->mappedPtr; + lightStorageBuffers[frameIndex].capacity = 1; + lightStorageBuffers[frameIndex].size = 0; + // Zero-initialize to prevent garbage data + if 
(!!lightStorageBuffers[frameIndex].mapped) { + std::memset(lightStorageBuffers[frameIndex].mapped, 0, minSize); + } + } + if (frameIndex < lightStorageBuffers.size() && !!*lightStorageBuffers[frameIndex].buffer) { + lightsBuf = *lightStorageBuffers[frameIndex].buffer; + } + } + + // Ensure tile headers buffer exists (binding 7) - create minimal dummy if needed + if (!headersBuf) { + if (forwardPlusPerFrame.empty()) { + forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT); + } + if (frameIndex < forwardPlusPerFrame.size()) { + auto& f = forwardPlusPerFrame[frameIndex]; + if (!*f.tileHeaders) { + vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Single TileHeader {offset, count, pad0, pad1} + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileHeaders = std::move(buf); + f.tileHeadersAlloc = std::move(alloc); + if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) { + std::memset(f.tileHeadersAlloc->mappedPtr, 0, minSize); + } + } + if (!!*f.tileHeaders) + headersBuf = *f.tileHeaders; + } + } + + // Ensure tile light indices buffer exists (binding 8) - create minimal dummy if needed + if (!indicesBuf) { + if (forwardPlusPerFrame.empty()) { + forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT); + } + if (frameIndex < forwardPlusPerFrame.size()) { + auto& f = forwardPlusPerFrame[frameIndex]; + if (!*f.tileLightIndices) { + vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Minimal array of 4 uints + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileLightIndices = std::move(buf); + f.tileLightIndicesAlloc = std::move(alloc); + if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) { + std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, minSize); + } + } + if (!!*f.tileLightIndices) + 
indicesBuf = *f.tileLightIndices; + } + } + + std::vector writes; + vk::DescriptorBufferInfo lightsInfo{}; + vk::DescriptorBufferInfo headersInfo{}; + vk::DescriptorBufferInfo indicesInfo{}; + vk::DescriptorBufferInfo geoInfoInfo{}; + vk::DescriptorBufferInfo matInfoInfo{}; + vk::DescriptorBufferInfo fragDbgInfo{}; + + // At this point, all three critical buffers (lights, headers, indices) should exist (real or dummy) + if (!!lightsBuf) { + lightsInfo = vk::DescriptorBufferInfo{.buffer = lightsBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + } + // Current frame fragment debug buffer (reuse compute debugOut) - this one is optional + if (frameIndex < forwardPlusPerFrame.size()) { + auto& fpf = forwardPlusPerFrame[frameIndex]; + if (!!*fpf.debugOut) { + fragDbgInfo = vk::DescriptorBufferInfo{.buffer = *fpf.debugOut, .offset = 0, .range = VK_WHOLE_SIZE}; + } + } + if (!!headersBuf) { + headersInfo = vk::DescriptorBufferInfo{.buffer = headersBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + } + if (!!indicesBuf) { + indicesInfo = vk::DescriptorBufferInfo{.buffer = indicesBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + } + + // Binding 10: reflection sampler — always bind fallback texture while reflection pass is disabled + // The reflection rendering pass is currently disabled (commented out in renderer_rendering.cpp + // lines 1194-1203), so we must not bind any reflection RTs that may exist but contain stale data. + // When reflection rendering is re-enabled, restore the conditional logic to bind previous frame's RT. 
+ vk::DescriptorImageInfo reflInfo{}; + reflInfo = vk::DescriptorImageInfo{.sampler = *defaultTextureResources.textureSampler, .imageView = *defaultTextureResources.textureImageView, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal}; + + // Binding 11: TLAS (for raster ray-query shadows) + // Raster PBR shaders can statically declare/use `tlas` even when ray-query mode is disabled, + // so the descriptor must be written whenever acceleration structures are enabled. + vk::AccelerationStructureKHR tlasHandleValue = accelerationStructureEnabled ? *tlasStructure.handle : vk::AccelerationStructureKHR{}; + vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{}; + tlasInfo.accelerationStructureCount = 1; + tlasInfo.pAccelerationStructures = &tlasHandleValue; + + for (auto& kv : entityResources) { + auto& res = kv.second; + if (res.pbrDescriptorSets.empty() || frameIndex >= res.pbrDescriptorSets.size()) + continue; + + // This prevents "Invalid VkDescriptorSet Object" errors when sets have been freed/invalidated + if (!(*res.pbrDescriptorSets[frameIndex])) { + std::cerr << "Warning: Invalid descriptor set handle for entity at frame " << frameIndex << ", skipping" << std::endl; + continue; + } + + // Binding 6: lights SSBO - ALWAYS bind (required by layout) + if (!!lightsBuf) { + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 6, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightsInfo}); + } + // Binding 7: tile headers - ALWAYS bind (required by layout) + if (!!headersBuf) { + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo}); + } + // Binding 8: tile indices - ALWAYS bind (required by layout) + if (!!indicesBuf) { + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], 
.dstBinding = 8, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo}); + } + // Binding 9: fragment debug output buffer (optional - only bind if exists) + if (!!fragDbgInfo.buffer) { + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 9, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &fragDbgInfo}); + } + // Binding 10: reflection sampler - ALWAYS bind (required by layout) + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 10, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &reflInfo}); + + // Binding 11: TLAS - ALWAYS bind (required by layout when ray query/AS is enabled) + // If TLAS is not built yet, the handle will be null; the shader must not trace when disabled. + vk::WriteDescriptorSet tlasWrite{}; + tlasWrite.dstSet = *res.pbrDescriptorSets[frameIndex]; + tlasWrite.dstBinding = 11; + tlasWrite.dstArrayElement = 0; + tlasWrite.descriptorCount = 1; + tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR; + tlasWrite.pNext = &tlasInfo; + writes.push_back(tlasWrite); + + // Binding 12/13: Ray-query geometry/material buffers for material-aware raster shadow queries. + // Always bind something valid; shader guards on `ubo.geometryInfoCount/materialCount`. + vk::Buffer fallbackBuf = headersBuf ? headersBuf : indicesBuf; + vk::Buffer geoBuf = (!!*geometryInfoBuffer) ? *geometryInfoBuffer : fallbackBuf; + vk::Buffer matBuf = (!!*materialBuffer) ? 
*materialBuffer : fallbackBuf; + geoInfoInfo = vk::DescriptorBufferInfo{.buffer = geoBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + matInfoInfo = vk::DescriptorBufferInfo{.buffer = matBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 12, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &geoInfoInfo}); + writes.push_back(vk::WriteDescriptorSet{.dstSet = *res.pbrDescriptorSets[frameIndex], .dstBinding = 13, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &matInfoInfo}); + } + + if (!writes.empty()) { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(writes, {}); + } + } catch (const std::exception& e) { + std::cerr << "Failed to refresh PBR Forward+ bindings for frame " << frameIndex << ": " << e.what() << std::endl; + } +} + +// Update the light storage buffer with current light data +bool Renderer::updateLightStorageBuffer(uint32_t frameIndex, const std::vector& lights, CameraComponent* camera) { + try { + // Ensure buffers are large enough and properly initialized + if (!createOrResizeLightStorageBuffers(lights.size())) { + return false; + } + + // Now check frame index after buffers are properly initialized + if (frameIndex >= lightStorageBuffers.size()) { + std::cerr << "Invalid frame index for light storage buffer update: " << frameIndex + << " >= " << lightStorageBuffers.size() << std::endl; + return false; + } + + auto& buffer = lightStorageBuffers[frameIndex]; + if (!buffer.mapped) { + std::cerr << "Light storage buffer not mapped" << std::endl; + return false; + } + + // Convert ExtractedLight data to LightData format + auto* lightData = static_cast(buffer.mapped); + for (size_t i = 0; i < lights.size(); ++i) { + const auto& light = lights[i]; + + // For directional lights, store direction in position field (they don't need position) + // For other lights, store 
position + if (light.type == ExtractedLight::Type::Directional) { + lightData[i].position = glm::vec4(light.direction, 0.0f); // w=0 indicates direction + } else { + lightData[i].position = glm::vec4(light.position, 1.0f); // w=1 indicates position + } + + lightData[i].color = glm::vec4(light.color * light.intensity, 1.0f); + lightData[i].direction = glm::vec4(light.direction, 0.0f); + + // Calculate light space matrix for shadow mapping + glm::mat4 lightProjection, lightView; + if (light.type == ExtractedLight::Type::Directional) { + float orthoSize = 50.0f; + glm::vec3 shadowCamPos = light.position; + glm::vec3 lightDir = glm::normalize(light.direction); + if (camera) { + // Center shadow map on camera frustum + glm::vec3 camPos = camera->GetPosition(); + shadowCamPos = camPos - lightDir * 50.0f; + } + lightProjection = glm::ortho(-orthoSize, orthoSize, -orthoSize, orthoSize, 0.1f, 200.0f); + + // Robust up vector to avoid LookAt singularities with vertical lights + glm::vec3 up = (std::abs(lightDir.y) > 0.99f) ? 
glm::vec3(0.0f, 0.0f, 1.0f) : glm::vec3(0.0f, 1.0f, 0.0f); + lightView = glm::lookAt(shadowCamPos, shadowCamPos + lightDir, up); + } else { + lightProjection = glm::perspective(glm::radians(90.0f), 1.0f, 0.1f, light.range); + lightView = glm::lookAt(light.position, light.position + light.direction, glm::vec3(0.0f, 1.0f, 0.0f)); + } + lightData[i].lightSpaceMatrix = lightProjection * lightView; + + // Set light type + switch (light.type) { + case ExtractedLight::Type::Point: + lightData[i].lightType = 0; + break; + case ExtractedLight::Type::Directional: + lightData[i].lightType = 1; + break; + case ExtractedLight::Type::Spot: + lightData[i].lightType = 2; + break; + case ExtractedLight::Type::Emissive: + lightData[i].lightType = 3; + break; + } + + // Set other light properties + lightData[i].range = light.range; + lightData[i].innerConeAngle = light.innerConeAngle; + lightData[i].outerConeAngle = light.outerConeAngle; + } + + // Update buffer size + buffer.size = lights.size(); + + return true; + } catch (const std::exception& e) { + std::cerr << "Failed to update light storage buffer: " << e.what() << std::endl; + return false; + } +} + +// Asynchronous texture loading implementations using ThreadPool +std::future Renderer::LoadTextureAsync(const std::string& texturePath, bool critical) { + if (texturePath.empty()) { + return std::async(std::launch::deferred, [] { return false; }); + } + // Transition UI phase to Textures on the very first scheduled job so the overlay + // doesn’t sit at 0% before any work is actually enqueued. + if (textureTasksScheduled.load(std::memory_order_relaxed) == 0u) { + SetLoadingPhase(LoadingPhase::Textures); + loadingPhaseProgress.store(0.0f, std::memory_order_relaxed); + } + // Schedule a CPU-light job that enqueues a pending GPU upload to be + // processed later on the main thread. This avoids submitting Vulkan + // command buffers from worker threads, which can confuse GPU-assisted + // validation. 
+ textureTasksScheduled.fetch_add(1, std::memory_order_relaxed); + uploadJobsTotal.fetch_add(1, std::memory_order_relaxed); + auto task = [this, texturePath, critical]() { + PendingTextureJob job; + job.type = PendingTextureJob::Type::FromFile; + job.priority = critical ? PendingTextureJob::Priority::Critical : PendingTextureJob::Priority::NonCritical; + // Count a critical job BEFORE it is published to the queue: an uploads worker may dequeue it and + // decrement criticalJobsOutstanding as soon as the queue lock below is released, so incrementing + // after notify_one() could let that decrement run first. + if (critical) { + criticalJobsOutstanding.fetch_add(1, std::memory_order_relaxed); + } + job.idOrPath = texturePath; { + std::lock_guard lk(pendingTextureJobsMutex); + pendingTextureJobs.emplace_back(std::move(job)); + } + pendingTextureCv.notify_one(); + return true; + }; + + std::shared_lock lock(threadPoolMutex); + if (!threadPool) { + return std::async(std::launch::async, task); + } + return threadPool->enqueue(task); +} + +std::future Renderer::LoadTextureFromMemoryAsync(const std::string& textureId, + const unsigned char* imageData, + int width, + int height, + int channels, + bool critical) { + if (!imageData || textureId.empty() || width <= 0 || height <= 0 || channels <= 0) { + return std::async(std::launch::deferred, [] { return false; }); + } + // Copy the source bytes so the caller can free/modify their buffer immediately + size_t srcSize = static_cast(width) * static_cast(height) * static_cast(channels); + std::vector dataCopy(srcSize); + std::memcpy(dataCopy.data(), imageData, srcSize); + + if (textureTasksScheduled.load(std::memory_order_relaxed) == 0u) { + SetLoadingPhase(LoadingPhase::Textures); + loadingPhaseProgress.store(0.0f, std::memory_order_relaxed); + } + textureTasksScheduled.fetch_add(1, std::memory_order_relaxed); + uploadJobsTotal.fetch_add(1, std::memory_order_relaxed); + auto task = [this, textureId, data = std::move(dataCopy), width, height, channels, critical]() mutable { + PendingTextureJob job; + job.type = PendingTextureJob::Type::FromMemory; + job.priority = critical ? 
PendingTextureJob::Priority::Critical : PendingTextureJob::Priority::NonCritical; + // Count a critical job BEFORE it is published to the queue: an uploads worker may dequeue it and + // decrement criticalJobsOutstanding as soon as the queue lock below is released, so incrementing + // after notify_one() could let that decrement run first. + if (critical) { + criticalJobsOutstanding.fetch_add(1, std::memory_order_relaxed); + } + job.idOrPath = textureId; + job.data = std::move(data); + job.width = width; + job.height = height; + job.channels = channels; { + std::lock_guard lk(pendingTextureJobsMutex); + pendingTextureJobs.emplace_back(std::move(job)); + } + pendingTextureCv.notify_one(); + return true; + }; + + std::shared_lock lock(threadPoolMutex); + if (!threadPool) { + return std::async(std::launch::async, std::move(task)); + } + return threadPool->enqueue(std::move(task)); +} + +void Renderer::WaitForAllTextureTasks() { + // Simple blocking wait: spin until all scheduled texture tasks have completed. + // This is only intended for use during initial scene loading where a short + // stall is acceptable to ensure descriptor sets see all real textures. + auto start = std::chrono::steady_clock::now(); + uint32_t lastCompleted = 0xFFFFFFFF; + for (;;) { + uint32_t scheduled = textureTasksScheduled.load(std::memory_order_relaxed); + uint32_t completed = textureTasksCompleted.load(std::memory_order_relaxed); + if (scheduled == 0 || completed >= scheduled) { + break; + } + auto now = std::chrono::steady_clock::now(); + if (std::chrono::duration_cast(now - start).count() > 60) { + std::cerr << "WARNING: WaitForAllTextureTasks timed out after 60s! 
(" << completed << "/" << scheduled << ")" << std::endl; + break; + } + // Sleep briefly to yield CPU while background texture jobs finish + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } +} + +// Start background worker threads that drain pending texture jobs and perform GPU uploads +void Renderer::StartUploadsWorker(size_t workerCount) { + stopUploadsWorker.store(false, std::memory_order_relaxed); + if (workerCount == 0) { + unsigned int hw = std::thread::hardware_concurrency(); + // Heuristic: at least 2 workers, at most 4, and not exceeding half of HW threads + unsigned int target = std::max(2u, std::min(4u, hw > 0 ? hw / 2 : 2u)); + workerCount = static_cast(target); + } + uploadsWorkerThreads.reserve(workerCount); + for (size_t t = 0; t < workerCount; ++t) { + uploadsWorkerThreads.emplace_back([this]() { + ensureThreadLocalVulkanInit(); + while (!stopUploadsWorker.load(std::memory_order_relaxed)) { + // Wait for work or stop signal + { + std::unique_lock lk(pendingTextureJobsMutex); + pendingTextureCv.wait(lk, + [this]() { + return stopUploadsWorker.load(std::memory_order_relaxed) || !pendingTextureJobs.empty(); + }); + } + if (stopUploadsWorker.load(std::memory_order_relaxed)) + break; + + // Drain a batch of jobs + std::vector batch; { + std::lock_guard lk(pendingTextureJobsMutex); + const size_t maxBatch = 16; // simple batch size to limit command overhead + const size_t take = std::min(maxBatch, pendingTextureJobs.size()); + batch.reserve(take); + for (size_t i = 0; i < take; ++i) { + batch.emplace_back(std::move(pendingTextureJobs.back())); + pendingTextureJobs.pop_back(); + } + } + + // Process critical jobs first + auto isCritical = [](const PendingTextureJob& j) { return j.priority == PendingTextureJob::Priority::Critical; }; + std::stable_sort(batch.begin(), + batch.end(), + [&](const PendingTextureJob& a, const PendingTextureJob& b) { + return isCritical(a) && !isCritical(b); + }); + + // Try to batch FromMemory jobs together for a 
single transfer submit + std::vector memJobs; + for (auto& j : batch) + if (j.type == PendingTextureJob::Type::FromMemory) + memJobs.push_back(std::move(j)); + // Remove moved jobs from batch + batch.erase(std::remove_if(batch.begin(), batch.end(), [](const PendingTextureJob& j) { return j.type == PendingTextureJob::Type::FromMemory; }), batch.end()); + + if (!memJobs.empty()) { + try { + // Process batched memory uploads with a single submit + // Fallback to per-job if batching fails for any reason + auto processSingle = [&](const PendingTextureJob& job) { + (void) LoadTextureFromMemory(job.idOrPath, + job.data.data(), + job.width, + job.height, + job.channels); + OnTextureUploaded(job.idOrPath); + if (job.priority == PendingTextureJob::Priority::Critical) { + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + // Note: textureTasksCompleted is incremented by OnTextureUploaded + }; + + // Build staging buffers and images without submitting yet + struct Item { + std::string id; + vk::raii::Buffer staging; + std::unique_ptr stagingAlloc; + std::vector rgba; + uint32_t w, h; + vk::Format format; + std::vector regions; + uint32_t mipLevels; + vk::raii::Image image; + std::unique_ptr imageAlloc; + bool success = false; + }; + std::vector items; + items.reserve(memJobs.size()); + + for (auto& job : memJobs) { + try { + // Create staging buffer and copy data + const vk::DeviceSize imgSize = static_cast(job.width * job.height * 4); + auto [stagingBuf, stagingAlloc] = createBufferPooled(imgSize, vk::BufferUsageFlagBits::eTransferSrc, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + void* mapped = stagingAlloc->mappedPtr; + // Convert to RGBA if not already + std::vector rgba; + rgba.resize(static_cast(imgSize)); + const uint8_t* src = job.data.data(); + if (job.channels == 4) { + std::memcpy(rgba.data(), src, static_cast(imgSize)); + } else if 
(job.channels == 3) { + for (int y = 0; y < job.height; ++y) { + for (int x = 0; x < job.width; ++x) { + size_t si = (y * job.width + x) * 3; + size_t di = (y * job.width + x) * 4; + rgba[di + 0] = src[si + 0]; + rgba[di + 1] = src[si + 1]; + rgba[di + 2] = src[si + 2]; + rgba[di + 3] = 255; + } + } + } else if (job.channels == 1) { + for (int i = 0, n = job.width * job.height; i < n; ++i) { + uint8_t v = src[i]; + size_t di = i * 4; + rgba[di + 0] = v; + rgba[di + 1] = v; + rgba[di + 2] = v; + rgba[di + 3] = 255; + } + } else { + // unsupported layout, fallback to single path which will handle it + processSingle(job); + continue; + } + std::memcpy(mapped, rgba.data(), static_cast(imgSize)); + // Persistent mapping via memory pool; no explicit unmap needed here + + // Create image (concurrent sharing if needed) + bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value(); + std::vector families; + if (differentFamilies) + families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()}; + vk::Format texFormat = determineTextureFormat(job.idOrPath); + vk::ImageUsageFlags usage = vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; + // Enable Host Transfer if supported (Vulkan 1.4) + usage |= vk::ImageUsageFlagBits::eHostTransfer; + + auto [image, imageAlloc] = createImagePooled(job.width, job.height, texFormat, vk::ImageTiling::eOptimal, usage, vk::MemoryPropertyFlagBits::eDeviceLocal, 1, differentFamilies ? 
vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, families); + + // Prepare one region + std::vector regions{ + vk::BufferImageCopy{ + .bufferOffset = 0, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = {.aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1}, + .imageOffset = {0, 0, 0}, + .imageExtent = {static_cast(job.width), static_cast(job.height), 1} + } + }; + + items.push_back(Item{job.idOrPath, std::move(stagingBuf), std::move(stagingAlloc), std::move(rgba), static_cast(job.width), static_cast(job.height), texFormat, std::move(regions), 1, std::move(image), std::move(imageAlloc)}); + } catch (const std::exception& e) { + std::cerr << "Batch prepare failed for '" << job.idOrPath << "': " << e.what() << ". Falling back to single." << std::endl; + processSingle(job); + continue; + } + } + + if (!items.empty()) { + // MODERN: Use Vulkan 1.4 Host Image Copies (Direct CPU-to-Image Access) + // This completely avoids command buffer recording, pools, and fence waits! 
+ for (auto& it : items) { + try { + // Transition undefined -> General using Host transition + vk::HostImageLayoutTransitionInfoEXT toHostCopy{ + .image = *it.image, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eGeneral, + .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1} + }; + device.transitionImageLayoutEXT(toHostCopy); + + // Copy from host pointer directly to image + vk::MemoryToImageCopyEXT region{ + .pHostPointer = it.rgba.data(), + .imageSubresource = {.aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1}, + .imageOffset = {0, 0, 0}, + .imageExtent = {it.w, it.h, 1} + }; + vk::CopyMemoryToImageInfoEXT copyInfo{ + .dstImage = *it.image, + .dstImageLayout = vk::ImageLayout::eGeneral, + .regionCount = 1, + .pRegions = ®ion + }; + device.copyMemoryToImageEXT(copyInfo); + + // Final transition to shader-read for engine use + vk::HostImageLayoutTransitionInfoEXT toShaderRead{ + .image = *it.image, + .oldLayout = vk::ImageLayout::eGeneral, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1} + }; + device.transitionImageLayoutEXT(toShaderRead); + it.success = true; + } catch (const std::exception& e) { + std::cerr << "HostImageCopy failed for '" << it.id << "': " << e.what() << std::endl; + it.success = false; + continue; + } + } + + // Perf accounting for the batch + uint64_t batchBytes = 0; + for (auto& it : items) + batchBytes += static_cast(it.w) * it.h * 4ull; + bytesUploadedTotal.fetch_add(batchBytes, std::memory_order_relaxed); + uploadCount.fetch_add(static_cast(items.size()), std::memory_order_relaxed); + + // Finalize resources and notify + for (auto& it : items) { + if (it.success) { + // Store in textureResources + TextureResources res; + 
res.textureImage = std::move(it.image); + res.textureImageAllocation = std::move(it.imageAlloc); + res.format = it.format; + res.mipLevels = it.mipLevels; + res.alphaMaskedHint = false; // heuristic omitted in batch + // Create sampler/view + createTextureSampler(res); + res.textureImageView = createImageView(res.textureImage, res.format, vk::ImageAspectFlagBits::eColor, res.mipLevels); { + std::unique_lock lk(textureResourcesMutex); + textureResources[it.id] = std::move(res); + } + OnTextureUploaded(it.id); + // Update counters + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + } + // Note: textureTasksCompleted is incremented by OnTextureUploaded (if success) + // or we increment it here manually if it failed. + if (!it.success) { + textureTasksCompleted.fetch_add(1, std::memory_order_relaxed); + } + } + // Decrement outstanding critical jobs if any + for (auto& job : memJobs) + if (job.priority == PendingTextureJob::Priority::Critical) + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + } catch (const std::exception& e) { + std::cerr << "UploadsWorker: batch processing failed: " << e.what() << std::endl; + // Fallback: per-job processing + for (auto& job : memJobs) { + try { + (void) LoadTextureFromMemory(job.idOrPath, + job.data.data(), + job.width, + job.height, + job.channels); + OnTextureUploaded(job.idOrPath); + if (job.priority == PendingTextureJob::Priority::Critical) { + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + // textureTasksCompleted handled by OnTextureUploaded + } catch (...) 
{ + textureTasksCompleted.fetch_add(1, std::memory_order_relaxed); + } + } + } + } + + // Process remaining non-memory jobs individually + for (auto& job : batch) { + try { + if (job.type == PendingTextureJob::Type::FromFile) { + (void) LoadTexture(job.idOrPath); + OnTextureUploaded(job.idOrPath); + if (job.priority == PendingTextureJob::Priority::Critical) { + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + // textureTasksCompleted handled by OnTextureUploaded + } + } catch (const std::exception& e) { + std::cerr << "UploadsWorker: failed to process job for '" << job.idOrPath << "': " << e.what() << std::endl; + textureTasksCompleted.fetch_add(1, std::memory_order_relaxed); + } + } + } + }); + } +} + +void Renderer::StopUploadsWorker() { + stopUploadsWorker.store(true, std::memory_order_relaxed); + pendingTextureCv.notify_all(); + for (auto& th : uploadsWorkerThreads) { + if (th.joinable()) + th.join(); + } + uploadsWorkerThreads.clear(); +} + +void Renderer::RegisterTextureUser(const std::string& textureId, Entity* entity) { + if (textureId.empty() || !entity) + return; + + // Always register under the canonical resolved ID so that lookups from + // descriptor creation and upload completion (which also use + // ResolveTextureId) are consistent. + std::string canonicalId = ResolveTextureId(textureId); + if (canonicalId.empty()) { + canonicalId = textureId; + } + + std::lock_guard lk(textureUsersMutex); + textureToEntities[canonicalId].push_back(entity); +} + +void Renderer::OnTextureUploaded(const std::string& textureId) { + // Resolve alias to canonical ID used for tracking and descriptor + // creation. RegisterTextureUser also stores under this canonical ID. 
+ std::string canonicalId = ResolveTextureId(textureId); + if (canonicalId.empty()) { + canonicalId = textureId; + } + + // Increment completed tasks for progress tracking + textureTasksCompleted.fetch_add(1, std::memory_order_relaxed); + + // Snapshot the registered users under the lock. A texture with no registered users must still + // fall through to the ray-query dirty marking below, so do not return early here. + std::vector users; { + std::lock_guard lk(textureUsersMutex); + auto it = textureToEntities.find(canonicalId); + if (it != textureToEntities.end()) { + users = it->second; + } + } + + // Always defer descriptor updates to the safe point at the start of Render() + // (after the in-flight fence for the current frame has been signaled). + // This avoids UPDATE_AFTER_BIND violations and mid-recording invalidation. + // If descriptor indexing / UPDATE_AFTER_BIND is enabled, we still prefer + // this safer path for consistency across devices. + for (Entity* entity : users) { + if (!entity) + continue; + MarkEntityDescriptorsDirty(entity); + } + + // Ray Query uses a global texture table (binding 6) that may reference this texture. + // Mark the ray query descriptor sets dirty for all frames so the render-thread safe point + // can refresh the table when the texture becomes available. + if (rayQueryEnabled && accelerationStructureEnabled) { + const uint32_t allFramesMask = (MAX_FRAMES_IN_FLIGHT >= 32u) ? 0xFFFFFFFFu : ((1u << MAX_FRAMES_IN_FLIGHT) - 1u); + rayQueryDescriptorsDirtyMask.fetch_or(allFramesMask, std::memory_order_relaxed); + } +} + +void Renderer::MarkEntityDescriptorsDirty(Entity* entity) { + if (!entity) + return; + // Mark this entity as needing refresh for *all* frames-in-flight. + // Each frame will refresh its own descriptor sets at its safe point. + const uint32_t allFramesMask = (MAX_FRAMES_IN_FLIGHT >= 32u) ? 
0xFFFFFFFFu : ((1u << MAX_FRAMES_IN_FLIGHT) - 1u); + std::lock_guard lk(dirtyEntitiesMutex); + auto& mask = descriptorDirtyEntities[entity]; + mask |= allFramesMask; + + // Also reset the last updated frame counters to force an update next time it's rendered + std::shared_lock entityLock(entityResourcesMutex); + auto it = entityResources.find(entity); + if (it != entityResources.end()) { + it->second.lastUpdatedFrameBasic.assign(MAX_FRAMES_IN_FLIGHT, 0xFFFFFFFFFFFFFFFFULL); + it->second.lastUpdatedFramePBR.assign(MAX_FRAMES_IN_FLIGHT, 0xFFFFFFFFFFFFFFFFULL); + } +} + +bool Renderer::updateDescriptorSetsForFrame(Entity* entity, + const std::string& texturePath, + bool usePBR, + uint32_t frameIndex, + bool imagesOnly, + bool uboOnly) { + std::shared_lock lock(entityResourcesMutex); + auto entityIt = entityResources.find(entity); + if (entityIt == entityResources.end()) + return false; + return updateDescriptorSetsForFrame(entity, entityIt->second, texturePath, usePBR, frameIndex, imagesOnly, uboOnly); +} + +bool Renderer::updateDescriptorSetsForFrame(Entity* entity, + EntityResources& res, + const std::string& texturePath, + bool usePBR, + uint32_t frameIndex, + bool imagesOnly, + bool uboOnly) { + if (!entity) + return false; + if (!descriptorSetsValid.load(std::memory_order_relaxed)) { + // Descriptor sets are being recreated; skip updates for now + return false; + } + // Defer descriptor writes if the command buffer is currently being recorded. + if (isRecordingCmd.load(std::memory_order_relaxed)) { + std::lock_guard qlk(pendingDescMutex); + pendingDescOps.push_back(PendingDescOp{entity, texturePath, usePBR, frameIndex, imagesOnly}); + descriptorRefreshPending.store(true, std::memory_order_relaxed); + return true; + } + // IMPORTANT: Do NOT hold `textureResourcesMutex` across this function. + // We may call `ResolveTextureId()` (which also locks it), and `std::shared_mutex` is not recursive. 
+ + // Ensure we have a valid UBO for this frame before attempting descriptor writes + if (frameIndex >= res.uniformBuffers.size() || + frameIndex >= res.uniformBuffersMapped.size() || + *res.uniformBuffers[frameIndex] == vk::Buffer{}) { + // Missing UBO for this frame; skip to avoid writing invalid descriptors + return false; + } + + vk::DescriptorSetLayout selectedLayout = usePBR ? *pbrDescriptorSetLayout : *descriptorSetLayout; + // Ensure descriptor sets exist for this entity + std::vector layouts(MAX_FRAMES_IN_FLIGHT, selectedLayout); + vk::DescriptorSetAllocateInfo allocInfo{.descriptorPool = *descriptorPool, .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, .pSetLayouts = layouts.data()}; + auto& targetDescriptorSets = usePBR ? res.pbrDescriptorSets : res.basicDescriptorSets; + bool newlyAllocated = false; + if (targetDescriptorSets.empty()) { + std::lock_guard lk(descriptorMutex); + auto sets = vk::raii::DescriptorSets(device, allocInfo); + targetDescriptorSets.clear(); + targetDescriptorSets.reserve(sets.size()); + for (auto& s : sets) { + targetDescriptorSets.emplace_back(std::move(s)); + } + newlyAllocated = true; + } + if (frameIndex >= targetDescriptorSets.size()) + return false; + + vk::DescriptorBufferInfo bufferInfo{.buffer = *res.uniformBuffers[frameIndex], .range = sizeof(UniformBufferObject)}; + + // Ensure per-pipeline UBO init tracking is sized + if (res.pbrUboBindingWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + if (res.basicUboBindingWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + if (res.pbrFixedBindingsWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + if (res.pbrImagesWritten.size() != MAX_FRAMES_IN_FLIGHT) { + res.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + if (res.basicImagesWritten.size() != MAX_FRAMES_IN_FLIGHT) { + 
res.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false); + } + + if (usePBR) { + // We'll fill descriptor writes. Binding 0 (UBO) is written only when explicitly requested (uboOnly) + // or when doing a full update (imagesOnly == false). For imagesOnly updates we must NOT touch UBO + // to avoid update-after-bind hazards. + std::vector writes; + std::array imageInfos; + // Helper: ensure required PBR layout bindings (7/8/10/11) are written at least once per frame. + // IMPORTANT: descriptor infos must remain alive until `updateDescriptorSets` is called. + vk::DescriptorBufferInfo headersInfo{}; + vk::DescriptorBufferInfo indicesInfo{}; + vk::DescriptorBufferInfo geoInfoInfo{}; + vk::DescriptorBufferInfo matInfoInfo{}; + vk::DescriptorImageInfo reflInfo{}; + vk::AccelerationStructureKHR tlasHandleValue{}; + vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{}; + vk::WriteDescriptorSet tlasWrite{}; + const bool needFixedWrites = !res.pbrFixedBindingsWritten[frameIndex]; + auto appendPbrFixedWrites = [&](std::vector& dstWrites) { + if (!needFixedWrites) + return; + + // Binding 7/8: Forward+ tile buffers (must be valid even when Forward+ is disabled) + if (forwardPlusPerFrame.empty()) { + forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT); + } + vk::Buffer headersBuf{}; + vk::Buffer indicesBuf{}; + if (frameIndex < forwardPlusPerFrame.size()) { + auto& f = forwardPlusPerFrame[frameIndex]; + if (!!*f.tileHeaders) + headersBuf = *f.tileHeaders; + if (!!*f.tileLightIndices) + indicesBuf = *f.tileLightIndices; + if (!headersBuf) { + vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Single TileHeader + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileHeaders = std::move(buf); + f.tileHeadersAlloc = std::move(alloc); + if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) { + std::memset(f.tileHeadersAlloc->mappedPtr, 0, 
minSize); + } + headersBuf = *f.tileHeaders; + } + if (!indicesBuf) { + vk::DeviceSize minSize = sizeof(uint32_t) * 4; + auto [buf, alloc] = createBufferPooled(minSize, + vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + f.tileLightIndices = std::move(buf); + f.tileLightIndicesAlloc = std::move(alloc); + if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) { + std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, minSize); + } + indicesBuf = *f.tileLightIndices; + } + } + headersInfo = vk::DescriptorBufferInfo{.buffer = headersBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + indicesInfo = vk::DescriptorBufferInfo{.buffer = indicesBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo}); + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 8, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo}); + + // Binding 10: reflection sampler (always bind safe fallback) + reflInfo = vk::DescriptorImageInfo{ + .sampler = *defaultTextureResources.textureSampler, + .imageView = *defaultTextureResources.textureImageView, + .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal + }; + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 10, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &reflInfo}); + + // Binding 11: TLAS (ray-query shadows in raster PBR fragment shader) + tlasHandleValue = accelerationStructureEnabled ? 
*tlasStructure.handle : vk::AccelerationStructureKHR{}; + tlasInfo.accelerationStructureCount = 1; + tlasInfo.pAccelerationStructures = &tlasHandleValue; + tlasWrite.dstSet = *targetDescriptorSets[frameIndex]; + tlasWrite.dstBinding = 11; + tlasWrite.dstArrayElement = 0; + tlasWrite.descriptorCount = 1; + tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR; + tlasWrite.pNext = &tlasInfo; + dstWrites.push_back(tlasWrite); + + // Binding 12/13: Ray-query geometry/material buffers for material-aware raster shadow queries. + // Always bind something valid; shader guards on `ubo.geometryInfoCount/materialCount`. + vk::Buffer fallbackBuf = headersBuf ? headersBuf : indicesBuf; + vk::Buffer geoBuf = (!!*geometryInfoBuffer) ? *geometryInfoBuffer : fallbackBuf; + vk::Buffer matBuf = (!!*materialBuffer) ? *materialBuffer : fallbackBuf; + geoInfoInfo = vk::DescriptorBufferInfo{.buffer = geoBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + matInfoInfo = vk::DescriptorBufferInfo{.buffer = matBuf, .offset = 0, .range = VK_WHOLE_SIZE}; + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 12, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &geoInfoInfo}); + dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 13, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &matInfoInfo}); + }; + + // Optionally write only the UBO (binding 0) — used at safe point to initialize per-frame sets once + if (uboOnly) { + // Avoid re-writing if we already initialized this frame's UBO binding + if (!res.pbrUboBindingWritten[frameIndex]) { + writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}); + } + appendPbrFixedWrites(writes); + if (!writes.empty()) { + std::lock_guard lk(descriptorMutex); + 
device.updateDescriptorSets(writes, {}); + if (!res.pbrUboBindingWritten[frameIndex]) { + res.pbrUboBindingWritten[frameIndex] = true; + } + if (needFixedWrites) { + res.pbrFixedBindingsWritten[frameIndex] = true; + } + } + return true; + } + + // For full updates (imagesOnly == false), include UBO write; for imagesOnly, skip it + if (!imagesOnly) { + writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}); + } + + auto meshComponent = entity->GetComponent(); + // Determine PBR texture paths in the same manner as createDescriptorSets + std::string legacyPath = (meshComponent ? meshComponent->GetTexturePath() : std::string()); + const std::string baseColorPath = (meshComponent && !meshComponent->GetBaseColorTexturePath().empty()) ? meshComponent->GetBaseColorTexturePath() : (!legacyPath.empty() ? legacyPath : SHARED_DEFAULT_ALBEDO_ID); + const std::string mrPath = (meshComponent && !meshComponent->GetMetallicRoughnessTexturePath().empty()) ? meshComponent->GetMetallicRoughnessTexturePath() : SHARED_DEFAULT_METALLIC_ROUGHNESS_ID; + const std::string normalPath = (meshComponent && !meshComponent->GetNormalTexturePath().empty()) ? meshComponent->GetNormalTexturePath() : SHARED_DEFAULT_NORMAL_ID; + const std::string occlusionPath = (meshComponent && !meshComponent->GetOcclusionTexturePath().empty()) ? meshComponent->GetOcclusionTexturePath() : SHARED_DEFAULT_OCCLUSION_ID; + const std::string emissivePath = (meshComponent && !meshComponent->GetEmissiveTexturePath().empty()) ? 
meshComponent->GetEmissiveTexturePath() : SHARED_DEFAULT_EMISSIVE_ID; + std::array pbrTexturePaths = {baseColorPath, mrPath, normalPath, occlusionPath, emissivePath}; + + for (int j = 0; j < 5; ++j) { + const std::string resolvedBindingPath = ResolveTextureId(pbrTexturePaths[j]); + vk::Sampler samplerHandle{}; + vk::ImageView viewHandle{}; { + std::shared_lock lock(textureResourcesMutex); + auto textureIt = textureResources.find(resolvedBindingPath); + TextureResources* texRes = (textureIt != textureResources.end()) ? &textureIt->second : &defaultTextureResources; + samplerHandle = *texRes->textureSampler; + viewHandle = *texRes->textureImageView; + } + imageInfos[j] = {.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal}; + writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = static_cast(j + 1), .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfos[j]}); + } + // Ensure Forward+ light buffer (binding 6) is written for the current frame when available. + // Do this even on imagesOnly updates so set 0 is fully valid for PBR shading. 
+ if (frameIndex < lightStorageBuffers.size() && !!*lightStorageBuffers[frameIndex].buffer) { + vk::DescriptorBufferInfo lightBufferInfo{.buffer = *lightStorageBuffers[frameIndex].buffer, .range = VK_WHOLE_SIZE}; + writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 6, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightBufferInfo}); + } + appendPbrFixedWrites(writes); { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(writes, {}); + } + if (needFixedWrites) { + res.pbrFixedBindingsWritten[frameIndex] = true; + } + if (!imagesOnly) { + res.pbrUboBindingWritten[frameIndex] = true; + } + } else { + const std::string resolvedTexturePath = ResolveTextureId(texturePath); + vk::Sampler samplerHandle{}; + vk::ImageView viewHandle{}; { + std::shared_lock lock(textureResourcesMutex); + auto textureIt = textureResources.find(resolvedTexturePath); + TextureResources* texRes = (textureIt != textureResources.end()) ? 
&textureIt->second : &defaultTextureResources; + samplerHandle = *texRes->textureSampler; + viewHandle = *texRes->textureImageView; + } + vk::DescriptorImageInfo imageInfo{.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal}; + if (imagesOnly && !newlyAllocated) { + std::array descriptorWrites = { + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo} + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + } else { + // If uboOnly is requested for basic pipeline, only write binding 0 + if (uboOnly) { + if (!res.basicUboBindingWritten[frameIndex]) { + std::array descriptorWrites = { + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo} + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + res.basicUboBindingWritten[frameIndex] = true; + } + return true; + } + std::array descriptorWrites = { + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}, + vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo} + }; { + std::lock_guard lk(descriptorMutex); + device.updateDescriptorSets(descriptorWrites, {}); + } + res.basicUboBindingWritten[frameIndex] = true; + } + } + return true; +} + +void Renderer::ProcessDirtyDescriptorsForFrame(uint32_t frameIndex) { + if (frameIndex >= 32u) + return; + const uint32_t frameBit = (1u << frameIndex); + + std::vector toProcess; { + 
std::lock_guard lk(dirtyEntitiesMutex); + if (descriptorDirtyEntities.empty()) + return; + toProcess.reserve(descriptorDirtyEntities.size()); + for (auto& [e, mask] : descriptorDirtyEntities) { + if (!!e && (mask & frameBit)) { + toProcess.push_back(e); + } + } + } + + uint32_t processed = 0; + for (Entity* entity : toProcess) { + if (!entity) + continue; + + // Kick watchdog periodically during heavy descriptor processing + if (++processed % 10 == 0) { + KickWatchdog(); + } + + auto meshComponent = entity->GetComponent(); + if (!meshComponent) + continue; + // Resolve a texture path to pass for the basic pipeline + std::string basicTexPath = meshComponent->GetTexturePath(); + if (basicTexPath.empty()) + basicTexPath = meshComponent->GetBaseColorTexturePath(); + // Update strategy: + // - Only update the current frame here at the safe point. + // Other frames will be updated at their own safe points to avoid UPDATE_AFTER_BIND violations. + { + std::shared_lock lock(entityResourcesMutex); + auto entityIt = entityResources.find(entity); + if (entityIt != entityResources.end()) { + updateDescriptorSetsForFrame(entity, entityIt->second, basicTexPath, false, frameIndex, /*imagesOnly=*/true); + updateDescriptorSetsForFrame(entity, entityIt->second, basicTexPath, true, frameIndex, /*imagesOnly=*/true); + } + } + // Do not touch descriptors for other frames while their command buffers may be pending. + } + + // Clear the processed bit; keep entities dirty until all frames have been refreshed. 
+ { + std::lock_guard lk(dirtyEntitiesMutex); + for (Entity* entity : toProcess) { + auto it = descriptorDirtyEntities.find(entity); + if (it == descriptorDirtyEntities.end()) + continue; + it->second &= ~frameBit; + if (it->second == 0u) { + descriptorDirtyEntities.erase(it); + } + } + } +} + +void Renderer::ProcessPendingTextureJobs(uint32_t maxJobs, + bool includeCritical, + bool includeNonCritical) { + // If the background uploads worker is running, it normally handles draining jobs. + // However, to guarantee forward progress (and avoid UI stalls if a worker is backlogged), + // we still drain a small bounded number of jobs on the render thread. + // Do NOT early-return here. + // Drain the pending job list under lock into a local vector, then + // perform a bounded number of texture loads (including Vulkan work) + // on this thread. This must be called from the main/render thread. + std::vector jobs; { + std::lock_guard lk(pendingTextureJobsMutex); + if (pendingTextureJobs.empty()) { + return; + } + jobs.swap(pendingTextureJobs); + } + + std::vector remaining; + remaining.reserve(jobs.size()); + + uint32_t processed = 0; + uint32_t watchdogCounter = 0; + for (auto& job : jobs) { + // Kick watchdog periodically during heavy texture processing + if (++watchdogCounter % 10 == 0) { + KickWatchdog(); + } + + const bool isCritical = (job.priority == PendingTextureJob::Priority::Critical); + if (processed < maxJobs && + ((isCritical && includeCritical) || (!isCritical && includeNonCritical))) { + switch (job.type) { + case PendingTextureJob::Type::FromFile: + // LoadTexture will resolve aliases and perform full GPU upload + LoadTexture(job.idOrPath); + break; + case PendingTextureJob::Type::FromMemory: + // LoadTextureFromMemory will create GPU resources for this ID + LoadTextureFromMemory(job.idOrPath, + job.data.data(), + job.width, + job.height, + job.channels); + break; + } + // Refresh descriptors for entities that use this texture so + // streaming uploads 
become visible in the scene. + OnTextureUploaded(job.idOrPath); + if (isCritical) { + criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed); + } + uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed); + ++processed; + } else { + remaining.emplace_back(std::move(job)); + } + } + + if (!remaining.empty()) { + std::lock_guard lk(pendingTextureJobsMutex); + // Append remaining jobs back to the pending queue + pendingTextureJobs.insert(pendingTextureJobs.end(), + std::make_move_iterator(remaining.begin()), + std::make_move_iterator(remaining.end())); + } +} + +// Record both layout transitions and the copy in a single submission with a fence +void Renderer::uploadImageFromStaging(vk::Buffer staging, + vk::Image image, + vk::Format format, + vk::ArrayProxy regions, + uint32_t mipLevels, + vk::DeviceSize stagedBytes) { + ensureThreadLocalVulkanInit(); + try { + // Start perf window on first upload + if (uploadWindowStartNs.load(std::memory_order_relaxed) == 0) { + auto now = std::chrono::steady_clock::now().time_since_epoch(); + uint64_t nowNs = static_cast(std::chrono::duration_cast(now).count()); + uploadWindowStartNs.store(nowNs, std::memory_order_relaxed); + } + auto t0 = std::chrono::steady_clock::now(); + + // Choose a queue family for uploads: prefer dedicated TRANSFER if available, else use GRAPHICS + bool hasTransferFamily = queueFamilyIndices.transferFamily.has_value(); + uint32_t uploadQueueFamily = hasTransferFamily ? 
queueFamilyIndices.transferFamily.value() : queueFamilyIndices.graphicsFamily.value(); + bool useTransferQueue = hasTransferFamily && !!*transferQueue; + + // Use a temporary transient command pool for the chosen queue family + vk::CommandPoolCreateInfo poolInfo{ + .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, + .queueFamilyIndex = uploadQueueFamily + }; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{ + .commandPool = *tempPool, + .level = vk::CommandBufferLevel::ePrimary, + .commandBufferCount = 1 + }; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + + vk::CommandBufferBeginInfo beginInfo{.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}; + cb.begin(beginInfo); + + // Barrier: Undefined -> TransferDstOptimal (all mip levels that will be copied) (Sync2) + vk::ImageMemoryBarrier2 toTransfer2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eTransferDstOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { + .aspectMask = (format == vk::Format::eD32Sfloat || format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint) ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = mipLevels, + .baseArrayLayer = 0, + .layerCount = 1 + } + }; + vk::DependencyInfo depToTransfer{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toTransfer2}; + cb.pipelineBarrier2(depToTransfer); + // Copy + cb.copyBufferToImage(staging, image, vk::ImageLayout::eTransferDstOptimal, regions); + // After copy, if we'll generate mips, keep level 0 in TRANSFER_SRC and leave others in TRANSFER_DST. + // Else transition ALL levels to SHADER_READ_ONLY. (Sync2) + const bool willGenerateMips = (mipLevels > 1 && regions.size() == 1); + if (willGenerateMips) { + vk::ImageMemoryBarrier2 postCopy2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eNone, + .oldLayout = vk::ImageLayout::eTransferDstOptimal, + .newLayout = vk::ImageLayout::eTransferSrcOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { + .aspectMask = (format == vk::Format::eD32Sfloat || format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint) ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1 + } + }; + vk::DependencyInfo depPostCopy{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &postCopy2}; + cb.pipelineBarrier2(depPostCopy); + } else { + vk::ImageMemoryBarrier2 allToSample{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eNone, + .oldLayout = vk::ImageLayout::eTransferDstOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { + .aspectMask = (format == vk::Format::eD32Sfloat || format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint) ? vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = mipLevels, + .baseArrayLayer = 0, + .layerCount = 1 + } + }; + vk::DependencyInfo depAllToSample{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &allToSample}; + cb.pipelineBarrier2(depAllToSample); + } + cb.end(); + + // Submit once on the TRANSFER queue; signal uploads timeline if available + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + bool canSignalTimeline = false; // Synchronous wait below; no timeline signal needed + uint64_t signalValue = 0; + SubmitToQueue2(useTransferQueue ? 
*transferQueue : *graphicsQueue, *cb, canSignalTimeline, &signalValue, *fence); + (void) waitForFencesSafe(*fence, VK_TRUE); + + // Perf accounting + auto t1 = std::chrono::steady_clock::now(); + auto ns = std::chrono::duration_cast(t1 - t0).count(); + totalUploadNs.fetch_add(static_cast(ns), std::memory_order_relaxed); + uploadCount.fetch_add(1, std::memory_order_relaxed); + if (stagedBytes > 0) { + bytesUploadedTotal.fetch_add(static_cast(stagedBytes), std::memory_order_relaxed); + } + } catch (const std::exception& e) { + std::cerr << "uploadImageFromStaging failed: " << e.what() << std::endl; + throw; + } +} + +// Generate full mip chain with linear blits (RGBA formats). Assumes level 0 is in TRANSFER_SRC_OPTIMAL. +void Renderer::generateMipmaps(vk::Image image, + vk::Format format, + int32_t texWidth, + int32_t texHeight, + uint32_t mipLevels) { + ensureThreadLocalVulkanInit(); + // Verify format supports linear blit + auto props = physicalDevice.getFormatProperties(format); + if ((props.optimalTilingFeatures & vk::FormatFeatureFlagBits::eSampledImageFilterLinear) == vk::FormatFeatureFlags{}) { + return; // no linear filter support; skip + } + + vk::CommandPoolCreateInfo poolInfo{.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value()}; + vk::raii::CommandPool tempPool(device, poolInfo); + vk::CommandBufferAllocateInfo allocInfo{.commandPool = *tempPool, .level = vk::CommandBufferLevel::ePrimary, .commandBufferCount = 1}; + vk::raii::CommandBuffers cbs(device, allocInfo); + vk::raii::CommandBuffer& cb = cbs[0]; + cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit}); + + int32_t mipW = texWidth; + int32_t mipH = texHeight; + for (uint32_t i = 1; i < mipLevels; ++i) { + // Transition level i to TRANSFER_DST (Sync2) + vk::ImageMemoryBarrier2 toDst2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe, .srcAccessMask = 
vk::AccessFlagBits2::eNone, .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, .oldLayout = vk::ImageLayout::eUndefined, .newLayout = vk::ImageLayout::eTransferDstOptimal, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = image, + .subresourceRange = {vk::ImageAspectFlagBits::eColor, i, 1, 0, 1} + }; + vk::DependencyInfo depToDst{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toDst2}; + cb.pipelineBarrier2(depToDst); + + // Blit from i-1 to i + vk::ImageBlit blit{}; + blit.srcSubresource.aspectMask = vk::ImageAspectFlagBits::eColor; + blit.srcSubresource.mipLevel = i - 1; + blit.srcSubresource.baseArrayLayer = 0; + blit.srcSubresource.layerCount = 1; + blit.srcOffsets[0] = vk::Offset3D{0, 0, 0}; + blit.srcOffsets[1] = vk::Offset3D{mipW, mipH, 1}; + blit.dstSubresource.aspectMask = vk::ImageAspectFlagBits::eColor; + blit.dstSubresource.mipLevel = i; + blit.dstSubresource.baseArrayLayer = 0; + blit.dstSubresource.layerCount = 1; + blit.dstOffsets[0] = vk::Offset3D{0, 0, 0}; + blit.dstOffsets[1] = vk::Offset3D{std::max(1, mipW / 2), std::max(1, mipH / 2), 1}; + cb.blitImage(image, vk::ImageLayout::eTransferSrcOptimal, image, vk::ImageLayout::eTransferDstOptimal, blit, vk::Filter::eLinear); + + // Transition previous level to SHADER_READ_ONLY (Sync2) + vk::ImageMemoryBarrier2 prevToRead2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, .srcAccessMask = vk::AccessFlagBits2::eTransferRead, .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, .dstAccessMask = vk::AccessFlagBits2::eShaderRead, .oldLayout = vk::ImageLayout::eTransferSrcOptimal, .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = image, .subresourceRange = {vk::ImageAspectFlagBits::eColor, i - 1, 1, 0, 1} + }; + 
vk::DependencyInfo depPrevToRead{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &prevToRead2}; + cb.pipelineBarrier2(depPrevToRead); + + mipW = std::max(1, mipW / 2); + mipH = std::max(1, mipH / 2); + } + // Transition last level to SHADER_READ_ONLY (Sync2) + vk::ImageMemoryBarrier2 lastToRead2{ + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, .dstAccessMask = vk::AccessFlagBits2::eShaderRead, .oldLayout = vk::ImageLayout::eTransferDstOptimal, .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = image, .subresourceRange = {vk::ImageAspectFlagBits::eColor, mipLevels - 1, 1, 0, 1} + }; + vk::DependencyInfo depLastToRead{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &lastToRead2}; + cb.pipelineBarrier2(depLastToRead); + + cb.end(); + + vk::raii::Fence fence(device, vk::FenceCreateInfo{}); + bool canSignalTimeline = false; // Synchronous wait below; no timeline signal needed + SubmitToQueue2(*graphicsQueue, *cb, canSignalTimeline, nullptr, *fence); + (void) waitForFencesSafe(*fence, VK_TRUE); +} diff --git a/attachments/sync2_engine/renderer_utils.cpp b/attachments/sync2_engine/renderer_utils.cpp new file mode 100644 index 00000000..4ed303bc --- /dev/null +++ b/attachments/sync2_engine/renderer_utils.cpp @@ -0,0 +1,397 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "renderer.h" +#include +#include +#include +#include +#include +#include + +// This file contains utility methods from the Renderer class + +// Find memory type +uint32_t Renderer::findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const { + try { + // Get memory properties + vk::PhysicalDeviceMemoryProperties memProperties = physicalDevice.getMemoryProperties(); + + // Find suitable memory type + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if ((typeFilter & (1 << i)) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { + return i; + } + } + + throw std::runtime_error("Failed to find suitable memory type"); + } catch (const std::exception& e) { + std::cerr << "Failed to find memory type: " << e.what() << std::endl; + throw; + } +} + +// Find supported format +vk::Format Renderer::findSupportedFormat(const std::vector& candidates, vk::ImageTiling tiling, vk::FormatFeatureFlags features) { + try { + for (vk::Format format : candidates) { + vk::FormatProperties props = physicalDevice.getFormatProperties(format); + + if (tiling == vk::ImageTiling::eLinear && (props.linearTilingFeatures & features) == features) { + return format; + } else if (tiling == vk::ImageTiling::eOptimal && (props.optimalTilingFeatures & features) == features) { + return format; + } + } + + throw std::runtime_error("Failed to find supported format"); + } catch (const std::exception& e) { + std::cerr << "Failed to find supported format: " << e.what() << std::endl; + throw; + } +} + +// Find depth 
format +vk::Format Renderer::findDepthFormat() { + try { + vk::Format depthFormat = findSupportedFormat( + {vk::Format::eD32Sfloat, vk::Format::eD32SfloatS8Uint, vk::Format::eD24UnormS8Uint}, + vk::ImageTiling::eOptimal, + vk::FormatFeatureFlagBits::eDepthStencilAttachment); + std::cout << "Found depth format: " << static_cast(depthFormat) << std::endl; + return depthFormat; + } catch (const std::exception& e) { + std::cerr << "Failed to find supported depth format, falling back to D32_SFLOAT: " << e.what() << std::endl; + // Fallback to D32_SFLOAT which is widely supported + return vk::Format::eD32Sfloat; + } +} + +// Check if format has stencil component +bool Renderer::hasStencilComponent(vk::Format format) { + return format == vk::Format::eD32SfloatS8Uint || format == vk::Format::eD24UnormS8Uint; +} + +// Resolve path +std::string Renderer::ResolvePath(const std::string& filename) const { + if (filename.empty()) return ""; + + // If it's already an absolute path, just check if it exists and return it + if (std::filesystem::path(filename).is_absolute()) { + if (std::filesystem::exists(filename)) { + return filename; + } + } + + std::vector searchPaths = { + filename, + "cmake-build-debug/" + filename, + "cmake-build-release/" + filename, + "build/" + filename, + "../" + filename, + "../../" + filename, + "../simple_engine/" + filename, + "../simple_engine/cmake-build-debug/" + filename, + "sync2_engine/cmake-build-debug/" + filename, + "simple_engine/cmake-build-debug/" + filename, + "../sync2_engine/cmake-build-debug/" + filename, + "assets/" + filename, + "../assets/" + filename, + "../../assets/" + filename, + "Assets/" + filename, + "../Assets/" + filename, + "../../Assets/" + filename + }; + + for (const auto& path : searchPaths) { + try { + if (std::filesystem::exists(path)) { + return path; + } + } catch (...) 
{ + // Ignore errors from weird path combinations + } + } + + return filename; // Fallback +} + +// Read file +std::vector Renderer::readFile(const std::string& filename) { + std::string resolvedPath = ResolvePath(filename); + + std::ifstream file(resolvedPath, std::ios::ate | std::ios::binary); + if (file.is_open()) { + size_t fileSize = file.tellg(); + std::vector buffer(fileSize); + file.seekg(0); + file.read(buffer.data(), fileSize); + file.close(); + return buffer; + } + + std::cerr << "CRITICAL: Failed to open file: " << resolvedPath << " (original name: " << filename << ")" << std::endl; + throw std::runtime_error("Failed to open file in any search path: " + filename); +} + +// Create shader module +vk::raii::ShaderModule Renderer::createShaderModule(const std::vector& code) { + try { + // Create shader module + vk::ShaderModuleCreateInfo createInfo{ + .codeSize = code.size(), + .pCode = reinterpret_cast(code.data()) + }; + + return vk::raii::ShaderModule(device, createInfo); + } catch (const std::exception& e) { + std::cerr << "Failed to create shader module: " << e.what() << std::endl; + throw; + } +} + +// Find queue families +QueueFamilyIndices Renderer::findQueueFamilies(const vk::raii::PhysicalDevice& device) { + QueueFamilyIndices indices; + + // Get queue family properties + std::vector queueFamilies = device.getQueueFamilyProperties(); + + // Find queue families that support graphics, compute, present, and (optionally) a dedicated transfer queue + for (uint32_t i = 0; i < queueFamilies.size(); i++) { + const auto& qf = queueFamilies[i]; + // Check for graphics support + if ((qf.queueFlags & vk::QueueFlagBits::eGraphics) && !indices.graphicsFamily.has_value()) { + indices.graphicsFamily = i; + } + // Check for compute support + if ((qf.queueFlags & vk::QueueFlagBits::eCompute) && !indices.computeFamily.has_value()) { + indices.computeFamily = i; + } + // Check for present support + if (!indices.presentFamily.has_value() && 
device.getSurfaceSupportKHR(i, *surface)) { + indices.presentFamily = i; + } + // Prefer a dedicated transfer queue (transfer bit set, but NOT graphics) if available + if ((qf.queueFlags & vk::QueueFlagBits::eTransfer) && !(qf.queueFlags & vk::QueueFlagBits::eGraphics)) { + if (!indices.transferFamily.has_value()) { + indices.transferFamily = i; + } + } + // If all required queue families are found, we can still continue to try find a dedicated transfer queue + if (indices.isComplete() && indices.transferFamily.has_value()) { + // Found everything including dedicated transfer + break; + } + } + + // Fallback: if no dedicated transfer queue, reuse graphics queue for transfer + if (!indices.transferFamily.has_value() && indices.graphicsFamily.has_value()) { + indices.transferFamily = indices.graphicsFamily; + } + + return indices; +} + +// Query swap chain support +SwapChainSupportDetails Renderer::querySwapChainSupport(const vk::raii::PhysicalDevice& device) { + SwapChainSupportDetails details; + + // Get surface capabilities + details.capabilities = device.getSurfaceCapabilitiesKHR(*surface); + + // Get surface formats + details.formats = device.getSurfaceFormatsKHR(*surface); + + // Get present modes + details.presentModes = device.getSurfacePresentModesKHR(*surface); + + return details; +} + +// Check device extension support +bool Renderer::checkDeviceExtensionSupport(vk::raii::PhysicalDevice& device) { + auto availableDeviceExtensions = device.enumerateDeviceExtensionProperties(); + + // Check if all required extensions are supported + std::set requiredExtensionsSet(requiredDeviceExtensions.begin(), requiredDeviceExtensions.end()); + + for (const auto& extension : availableDeviceExtensions) { + requiredExtensionsSet.erase(extension.extensionName); + } + + // Print missing required extensions + if (!requiredExtensionsSet.empty()) { + std::cout << "Missing required extensions:" << std::endl; + for (const auto& extension : requiredExtensionsSet) { + std::cout << " 
" << extension << std::endl; + } + return false; + } + + return true; +} + +// Check if device is suitable +bool Renderer::isDeviceSuitable(vk::raii::PhysicalDevice& device) { + // Check queue families + QueueFamilyIndices indices = findQueueFamilies(device); + + // Check device extensions + bool extensionsSupported = checkDeviceExtensionSupport(device); + + // Check swap chain support + bool swapChainAdequate = false; + if (extensionsSupported) { + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(device); + swapChainAdequate = !swapChainSupport.formats.empty() && !swapChainSupport.presentModes.empty(); + } + + // Check for required features + auto features = device.template getFeatures2(); + bool supportsRequiredFeatures = features.template get().dynamicRendering; + + return indices.isComplete() && extensionsSupported && swapChainAdequate && supportsRequiredFeatures; +} + +// Choose swap surface format +vk::SurfaceFormatKHR Renderer::chooseSwapSurfaceFormat(const std::vector& availableFormats) { + if (availableFormats.empty()) { + throw std::runtime_error("No surface formats available"); + } + // Look for SRGB format + for (const auto& availableFormat : availableFormats) { + if (availableFormat.format == vk::Format::eB8G8R8A8Srgb && availableFormat.colorSpace == vk::ColorSpaceKHR::eSrgbNonlinear) { + return availableFormat; + } + } + + // If not found, return first available format + return availableFormats[0]; +} + +// Choose swap present mode +vk::PresentModeKHR Renderer::chooseSwapPresentMode(const std::vector& availablePresentModes) { + if (availablePresentModes.empty()) { + return vk::PresentModeKHR::eFifo; + } + // Look for mailbox mode (triple buffering) + for (const auto& availablePresentMode : availablePresentModes) { + if (availablePresentMode == vk::PresentModeKHR::eMailbox) { + return availablePresentMode; + } + } + + // If not found, return FIFO mode (guaranteed to be available) + return vk::PresentModeKHR::eFifo; +} + +// Choose swap 
extent +vk::Extent2D Renderer::chooseSwapExtent(const vk::SurfaceCapabilitiesKHR& capabilities) { + + if (capabilities.currentExtent.width != std::numeric_limits::max()) { + return capabilities.currentExtent; + } else { + // Get framebuffer size + if (!platform) { + std::cerr << "CRITICAL: platform pointer is NULL in chooseSwapExtent!" << std::endl; + throw std::runtime_error("platform pointer is NULL"); + } + int width, height; + platform->GetWindowSize(&width, &height); + + // Create extent + uint32_t w = static_cast(width); + uint32_t h = static_cast(height); + + // Manual clamp to avoid std::clamp issues if any + if (w < capabilities.minImageExtent.width) w = capabilities.minImageExtent.width; + if (w > capabilities.maxImageExtent.width) w = capabilities.maxImageExtent.width; + if (h < capabilities.minImageExtent.height) h = capabilities.minImageExtent.height; + if (h > capabilities.maxImageExtent.height) h = capabilities.maxImageExtent.height; + + return vk::Extent2D{w, h}; + } +} + +// Wait for device to be idle +void Renderer::WaitIdle() { + // 1. Wait for all in-flight fences safely first + std::vector allFences; + allFences.reserve(inFlightFences.size()); + for (const auto& fence : inFlightFences) { + if (*fence) { + allFences.push_back(*fence); + } + } + if (!allFences.empty()) { + (void) waitForFencesSafe(allFences, VK_TRUE); + } + + // 2. 
Also wait for uploads and frame timeline semaphores if they exist + auto waitTimeline = [&](vk::raii::Semaphore& sem, std::atomic& nextValAtomic) { + if (*sem) { + uint64_t target = nextValAtomic.load(std::memory_order_relaxed); + if (target > 0) { + while (true) { + vk::SemaphoreWaitInfo waitInfo{}; + waitInfo.semaphoreCount = 1; + waitInfo.pSemaphores = &*sem; + waitInfo.pValues = ⌖ + + vk::Result r = device.waitSemaphores(waitInfo, 100'000'000ULL); // 100ms + if (r == vk::Result::eSuccess) + break; + if (r == vk::Result::eTimeout) { + continue; + } + break; // Other error + } + } + } + }; + + waitTimeline(uploadsTimeline, nextUploadTimelineValue); + waitTimeline(frameTimeline, nextFrameTimelineValue); + + // 3. Final blocking wait to ensure absolute idle + // External synchronization: ensure no queue submits/presents overlap a full device idle. + // This is required for VVL cleanliness when other threads may hold or use queues. + std::lock_guard lock(queueMutex); + device.waitIdle(); +} + +vk::Result Renderer::waitForFencesSafe(const std::vector& fences, vk::Bool32 waitAll, uint64_t timeoutNs) { + if (fences.empty()) + return vk::Result::eSuccess; + + while (true) { + vk::Result r = device.waitForFences(fences, waitAll, timeoutNs); + if (r == vk::Result::eSuccess) + return vk::Result::eSuccess; + if (r == vk::Result::eTimeout) { + // We do NOT kick the watchdog here. If the fence wait takes 10+ seconds, + // it means the GPU has legitimately hung or stalled, and we WANT the + // watchdog to abort and report it. 
+ continue; + } + return r; + } +} + +vk::Result Renderer::waitForFencesSafe(vk::Fence fence, vk::Bool32 waitAll, uint64_t timeoutNs) { + return waitForFencesSafe(std::vector{fence}, waitAll, timeoutNs); +} diff --git a/attachments/sync2_engine/scene_loading.cpp b/attachments/sync2_engine/scene_loading.cpp new file mode 100644 index 00000000..03190157 --- /dev/null +++ b/attachments/sync2_engine/scene_loading.cpp @@ -0,0 +1,607 @@ +/* Copyright (c) 2025 Holochip Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 the "License"; + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "scene_loading.h" +#include "animation_component.h" +#include "camera_component.h" +#include "engine.h" +#include "mesh_component.h" +#include "transform_component.h" +#include +#include +#include + +/** + * @brief Calculate bounding box dimensions for a MaterialMesh. + * @param materialMesh The MaterialMesh to analyze. + * @return The size of the bounding box (max - min for each axis). 
+ */ +glm::vec3 CalculateBoundingBoxSize(const MaterialMesh& materialMesh) { + if (materialMesh.vertices.empty()) { + return glm::vec3(0.0f); + } + + glm::vec3 minBounds = materialMesh.vertices[0].position; + glm::vec3 maxBounds = materialMesh.vertices[0].position; + + for (const auto& vertex : materialMesh.vertices) { + minBounds = glm::min(minBounds, vertex.position); + maxBounds = glm::max(maxBounds, vertex.position); + } + + return maxBounds - minBounds; +} + +/** + * @brief Load a GLTF model synchronously on the calling thread and create its light, camera, geometry, physics and animation entities (NOTE(review): comments in the body say scene loading runs on a background thread - confirm). + * @return true on success; false if the loader or renderer is null, the model fails to load or has no material meshes, or an exception is caught. + * @param engine The engine to create entities in. + * @param modelPath The path to the GLTF model file. + * @param position The position to place the model (the two-argument overload passes the origin). + * @param rotation The rotation to apply to the model, as Euler angles in degrees (default: no rotation). + * @param scale The scale to apply to the model (default: unit scale). + */ +bool LoadGLTFModel(Engine* engine, + const std::string& modelPath, + const glm::vec3& position, + const glm::vec3& rotation, + const glm::vec3& scale) { + // Get the model loader and renderer + ModelLoader* modelLoader = engine->GetModelLoader(); + Renderer* renderer = engine->GetRenderer(); + + if (!modelLoader || !renderer) { + std::cerr << "Error: ModelLoader or Renderer is null" << std::endl; + if (renderer) { + renderer->SetLoading(false); + } + return false; + } + // Ensure loading flag is cleared on any exit from this function + struct LoadingGuard { + Renderer* r; + ~LoadingGuard() { + r->SetLoading(false); + } + } loadingGuard{renderer}; + + // Extract model name from file path for entity naming + std::filesystem::path modelFilePath(modelPath); + std::string modelName = modelFilePath.stem().string(); // Get filename without extension + + try { + const auto loadStart = std::chrono::steady_clock::now(); + std::cout << "[Loading] Begin: " << modelPath << std::endl; + + // Loading large scenes can produce tens of thousands of
entities. + // Avoid per-entity stdout spam (very slow on Windows consoles) and instead + // keep counters + print occasional summaries. + size_t physicsBodiesQueued = 0; + size_t physicsBodiesSkipped = 0; + size_t physicsNoGeometry = 0; + auto maybeLogPhysicsProgress = [&]() { + const size_t total = physicsBodiesQueued + physicsBodiesSkipped + physicsNoGeometry; + // Log infrequently to keep visibility without tanking load time. + if (total > 0 && (total % 5000u) == 0u) { + std::cout << "[Loading] Physics bodies: queued=" << physicsBodiesQueued + << ", skipped=" << physicsBodiesSkipped + << ", noGeometry=" << physicsNoGeometry << std::endl; + } + }; + // Load the complete GLTF model with all textures and lighting on the calling thread + Model* loadedModel = modelLoader->LoadGLTF(modelPath); + if (!loadedModel) { + std::cerr << "Failed to load GLTF model: " << modelPath << std::endl; + return false; + } + + // Async texture uploads will continue in the background. + // Removing the blocking wait allows the engine to start quickly.
+ // renderer->WaitForAllTextureTasks(); + + std::cout << "Successfully loaded GLTF model with all textures and lighting: " << modelPath << std::endl; + + // Extract lights from the model and transform them to world space + std::vector extractedLights = modelLoader->GetExtractedLights(modelPath); + + // Create a transformation matrix from position, rotation, and scale + glm::mat4 transformMatrix = glm::mat4(1.0f); + transformMatrix = glm::translate(transformMatrix, position); + transformMatrix = glm::rotate(transformMatrix, glm::radians(rotation.x), glm::vec3(1.0f, 0.0f, 0.0f)); + transformMatrix = glm::rotate(transformMatrix, glm::radians(rotation.y), glm::vec3(0.0f, 1.0f, 0.0f)); + transformMatrix = glm::rotate(transformMatrix, glm::radians(rotation.z), glm::vec3(0.0f, 0.0f, 1.0f)); + transformMatrix = glm::scale(transformMatrix, scale); + + // Transform all light positions from local model space to world space + // Also transform the light direction (for directional lights) + glm::mat3 normalMatrix = glm::mat3(glm::transpose(glm::inverse(transformMatrix))); + for (auto& light : extractedLights) { + glm::vec4 worldPos = transformMatrix * glm::vec4(light.position, 1.0f); + light.position = glm::vec3(worldPos); + light.direction = glm::normalize(normalMatrix * light.direction); + } + + renderer->SetStaticLights(extractedLights); + + // Extract and apply cameras from the GLTF model + const std::vector& cameras = loadedModel->GetCameras(); + if (!cameras.empty()) { + const CameraData& gltfCamera = cameras[0]; // Use the first camera + + // Find or create a camera entity to replace the default one + Entity* cameraEntity = engine->GetEntity("Camera"); + if (!cameraEntity) { + // Create a new camera entity if none exists + cameraEntity = engine->CreateEntity("Camera"); + if (cameraEntity) { + cameraEntity->AddComponent(); + cameraEntity->AddComponent(); + } + } + + if (cameraEntity) { + // Update the camera transform with GLTF data + auto* cameraTransform = 
cameraEntity->GetComponent(); + if (cameraTransform) { + // Apply the transformation matrix to the camera position + glm::vec4 worldPos = transformMatrix * glm::vec4(gltfCamera.position, 1.0f); + cameraTransform->SetPosition(glm::vec3(worldPos)); + + // Apply rotation from GLTF camera + glm::vec3 eulerAngles = glm::eulerAngles(gltfCamera.rotation); + cameraTransform->SetRotation(eulerAngles); + } + + // Update the camera component with GLTF properties + auto* camera = cameraEntity->GetComponent(); + if (camera) { + camera->ForceViewMatrixUpdate(); // Only sets viewMatrixDirty flag, doesn't change camera orientation + if (gltfCamera.isPerspective) { + camera->SetFieldOfView(glm::degrees(gltfCamera.fov)); // Convert radians to degrees + camera->SetClipPlanes(gltfCamera.nearPlane, gltfCamera.farPlane); + if (gltfCamera.aspectRatio > 0.0f) { + camera->SetAspectRatio(gltfCamera.aspectRatio); + } + } else { + // Handle orthographic camera if needed + camera->SetProjectionType(CameraComponent::ProjectionType::Orthographic); + camera->SetOrthographicSize(gltfCamera.orthographicSize, gltfCamera.orthographicSize); + camera->SetClipPlanes(gltfCamera.nearPlane, gltfCamera.farPlane); + } + + // Set this as the active camera + engine->SetActiveCamera(camera); + } + } + } + + // Get the material meshes from the loaded model + const std::vector& materialMeshes = modelLoader->GetMaterialMeshes(modelPath); + if (materialMeshes.empty()) { + std::cerr << "No material meshes found in loaded model: " << modelPath << std::endl; + return false; + } + + // Collect all geometry entities so we can batch Vulkan uploads for their meshes + std::vector geometryEntities; + geometryEntities.reserve(materialMeshes.size()); + + // Phase: Physics (queue colliders / rigid bodies). This is CPU-side work that can + // take noticeable time even after textures have finished scheduling.
+ renderer->SetLoadingPhase(Renderer::LoadingPhase::Physics); + renderer->SetLoadingPhaseProgress(0.0f); + renderer->AddLoadingWorkItems(materialMeshes.size()); + + for (size_t meshIdx = 0; meshIdx < materialMeshes.size(); ++meshIdx) { + const auto& materialMesh = materialMeshes[meshIdx]; + if ((meshIdx % 64u) == 0u) { + renderer->SetLoadingPhaseProgress(materialMeshes.empty() ? 0.0f : (static_cast(meshIdx) / static_cast(materialMeshes.size()))); + // Yield to main thread and kick watchdog to ensure responsiveness + renderer->KickWatchdog(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + renderer->CompleteLoadingWorkItems(1); + // Create an entity name based on model and material + std::string entityName = modelName + "_Material_" + std::to_string(materialMesh.materialIndex) + + "_" + materialMesh.materialName; + + if (Entity* materialEntity = engine->CreateEntity(entityName)) { + // Add a transform component with provided parameters + auto* transform = materialEntity->AddComponent(); + transform->SetPosition(position); + transform->SetRotation(glm::radians(rotation)); + transform->SetScale(scale); + + // Add a mesh component with material-specific data + auto* mesh = materialEntity->AddComponent(); + mesh->SetVertices(materialMesh.vertices); + mesh->SetIndices(materialMesh.indices); + + if (materialMesh.GetInstanceCount() > 0) { + mesh->SetInstances(materialMesh.instances); + } + + // Set ALL PBR texture paths for this material + // Set primary texture path for backward compatibility + if (!materialMesh.texturePath.empty()) { + mesh->SetTexturePath(materialMesh.texturePath); + } + + // Set all PBR texture paths + if (!materialMesh.baseColorTexturePath.empty()) { + mesh->SetBaseColorTexturePath(materialMesh.baseColorTexturePath); + } + if (!materialMesh.normalTexturePath.empty()) { + mesh->SetNormalTexturePath(materialMesh.normalTexturePath); + } + if (!materialMesh.metallicRoughnessTexturePath.empty()) { + 
mesh->SetMetallicRoughnessTexturePath(materialMesh.metallicRoughnessTexturePath); + } + if (!materialMesh.occlusionTexturePath.empty()) { + mesh->SetOcclusionTexturePath(materialMesh.occlusionTexturePath); + } + if (!materialMesh.emissiveTexturePath.empty()) { + mesh->SetEmissiveTexturePath(materialMesh.emissiveTexturePath); + } + + // Fallback: Use material DB (from ModelLoader) if any PBR texture is still missing + if (modelLoader) { + const Material* mat = modelLoader->GetMaterial(materialMesh.materialName); + if (mat) { + if (mesh->GetBaseColorTexturePath().empty() && !mat->albedoTexturePath.empty()) { + mesh->SetBaseColorTexturePath(mat->albedoTexturePath); + } + if (mesh->GetNormalTexturePath().empty() && !mat->normalTexturePath.empty()) { + mesh->SetNormalTexturePath(mat->normalTexturePath); + } + if (mesh->GetMetallicRoughnessTexturePath().empty() && !mat->metallicRoughnessTexturePath.empty()) { + mesh->SetMetallicRoughnessTexturePath(mat->metallicRoughnessTexturePath); + } + if (mesh->GetOcclusionTexturePath().empty() && !mat->occlusionTexturePath.empty()) { + mesh->SetOcclusionTexturePath(mat->occlusionTexturePath); + } + if (mesh->GetEmissiveTexturePath().empty() && !mat->emissiveTexturePath.empty()) { + mesh->SetEmissiveTexturePath(mat->emissiveTexturePath); + } + } + } + + // Register all effective texture IDs this mesh uses so that when + // textures finish streaming in, the renderer can refresh + // descriptor sets for the appropriate entities. This must + // happen *after* material fallbacks so we see the final IDs.
+ auto registerTex = [&](const std::string& texId) { + if (!texId.empty()) { + renderer->RegisterTextureUser(texId, materialEntity); + } + }; + + registerTex(mesh->GetTexturePath()); + registerTex(mesh->GetBaseColorTexturePath()); + registerTex(mesh->GetNormalTexturePath()); + registerTex(mesh->GetMetallicRoughnessTexturePath()); + registerTex(mesh->GetOcclusionTexturePath()); + registerTex(mesh->GetEmissiveTexturePath()); + + // Track this entity for batched Vulkan resource pre-allocation later + geometryEntities.push_back(materialEntity); + + // Create physics body for collision with balls, but only for geometry + // that is reasonably close to the ground plane. This avoids creating + // expensive mesh colliders for high-up roofs and distant details. + PhysicsSystem* physicsSystem = engine->GetPhysicsSystem(); + if (physicsSystem) { + auto* mc = materialEntity->GetComponent(); + if (mc && !mc->GetVertices().empty() && !mc->GetIndices().empty()) { + // Compute a simple Y-range in WORLD space using the entity transform + // and the mesh's local AABB if available; otherwise approximate from vertices. + glm::vec3 minWS(std::numeric_limits::max()); + glm::vec3 maxWS(-std::numeric_limits::max()); + + auto* xform = materialEntity->GetComponent(); + glm::mat4 model = xform ? xform->GetModelMatrix() : glm::mat4(1.0f); + + if (mc->HasLocalAABB()) { + glm::vec3 localMin = mc->GetLocalAABBMin(); + glm::vec3 localMax = mc->GetLocalAABBMax(); + + // Transform the 8 corners of the local AABB to world space + for (int ix = 0; ix < 2; ++ix) { + for (int iy = 0; iy < 2; ++iy) { + for (int iz = 0; iz < 2; ++iz) { + glm::vec3 corner( + ix ? localMax.x : localMin.x, + iy ? localMax.y : localMin.y, + iz ? 
localMax.z : localMin.z); + glm::vec3 cWS = glm::vec3(model * glm::vec4(corner, 1.0f)); + minWS = glm::min(minWS, cWS); + maxWS = glm::max(maxWS, cWS); + } + } + } + } else { + // Fallback: compute bounds directly from vertices in world space + const auto& verts = mc->GetVertices(); + for (const auto& v : verts) { + glm::vec3 pWS = glm::vec3(model * glm::vec4(v.position, 1.0f)); + minWS = glm::min(minWS, pWS); + maxWS = glm::max(maxWS, pWS); + } + } + + // If we have a valid Y range and the mesh comes within 6 meters of the ground, + // create a physics body. Otherwise, skip it to save startup time and memory. + const float groundY = 0.0f; + const float maxDistanceFromGround = 6.0f; + bool nearGround = (minWS.y <= groundY + maxDistanceFromGround); + + if (nearGround) { + physicsSystem->EnqueueRigidBodyCreation( + materialEntity, + CollisionShape::Mesh, + 0.0f, + // mass 0 = static + true, + // kinematic + 0.15f, + // restitution + 0.5f // friction + ); + ++physicsBodiesQueued; + maybeLogPhysicsProgress(); + } else { + ++physicsBodiesSkipped; + maybeLogPhysicsProgress(); + } + } else { + ++physicsNoGeometry; + maybeLogPhysicsProgress(); + } + } + } else { + std::cerr << "Failed to create entity for material " << materialMesh.materialName << std::endl; + } + } + renderer->SetLoadingPhaseProgress(1.0f); + + // Pre-allocate Vulkan resources for all geometry entities in a single batched pass + if (!geometryEntities.empty()) { + // Scene loading runs on a background thread. Do NOT perform Vulkan allocations + // or mutate renderer resource maps here. Enqueue the batch so the render thread can + // perform the GPU work safely at its frame-start safe point.
+ renderer->AddLoadingWorkItems(geometryEntities.size()); + renderer->EnqueueEntityPreallocationBatch(geometryEntities); + } + + // Final loading summary (useful for profiling, low-noise) + std::cout << "[Loading] Physics bodies summary: queued=" << physicsBodiesQueued + << ", skipped=" << physicsBodiesSkipped + << ", noGeometry=" << physicsNoGeometry << std::endl; + + const auto loadEnd = std::chrono::steady_clock::now(); + const auto loadMs = std::chrono::duration_cast(loadEnd - loadStart).count(); + const auto loadSecs = static_cast(loadMs) / 1000.0; + const bool loadFastOk = loadSecs <= 60.0; + std::cout << "[Loading] End: " << modelPath << " in " << loadSecs << "s" << (loadFastOk ? "" : " (SLOW)") << std::endl; + + // Set up animations if the model has any + const std::vector& animations = loadedModel->GetAnimations(); + std::cout << "[Animation] Model has " << animations.size() << " animation(s)" << std::flush << std::endl; + if (!animations.empty()) { + std::cout << "[Animation] Setting up " << animations.size() << " animation(s) for playback" << std::flush << std::endl; + + // Create an animation controller entity + Entity* animController = engine->CreateEntity(modelName + "_AnimController"); + if (animController) { + auto* animTransform = animController->AddComponent(); + animTransform->SetPosition(position); + + auto* animComponent = animController->AddComponent(); + animComponent->SetAnimations(animations); + + // Build node-to-entity mapping using actual glTF node indices + // Get animated node mesh mappings to link geometry entities to animated nodes + const auto& animatedNodeMeshes = loadedModel->GetAnimatedNodeMeshes(); + + // Get the base transforms for animated nodes + const auto& animatedNodeTransforms = loadedModel->GetAnimatedNodeTransforms(); + + std::cout << "[Animation] Processing " << animatedNodeMeshes.size() << " animated nodes" << std::endl; + renderer->AddLoadingWorkItems(animatedNodeMeshes.size()); + + // Build nodeToEntity mapping by 
creating or finding entities for each animated node + std::unordered_map nodeToEntity; + std::unordered_map meshUsageCount; // Track how many times each mesh is used + + // First pass: count how many animated nodes use each mesh + for (const auto& [nodeIndex, meshIndex] : animatedNodeMeshes) { + meshUsageCount[meshIndex]++; + } + + // Second pass: create entities for animated nodes + for (const auto& [nodeIndex, meshIndex] : animatedNodeMeshes) { + std::cout << "[Animation] Processing animated node " << nodeIndex << " with mesh " << meshIndex << std::endl; + + // Find a MaterialMesh with this sourceMeshIndex + const MaterialMesh* sourceMaterialMesh = nullptr; + size_t sourceMaterialMeshIdx = 0; + for (size_t i = 0; i < materialMeshes.size(); ++i) { + if (materialMeshes[i].sourceMeshIndex == meshIndex) { + sourceMaterialMesh = &materialMeshes[i]; + sourceMaterialMeshIdx = i; + break; + } + } + + renderer->CompleteLoadingWorkItems(1); + if (!sourceMaterialMesh) { + std::cerr << "[Animation] WARNING: No MaterialMesh found for animated node " + << nodeIndex << " (mesh " << meshIndex << ")" << std::endl; + continue; + } + + Entity* nodeEntity = nullptr; + + // If this is the first animated node using this mesh, use the existing entity + // For subsequent nodes, create new entities + bool isFirstUse = (nodeToEntity.empty() || + std::none_of(nodeToEntity.begin(), + nodeToEntity.end(), + [meshIndex, &animatedNodeMeshes](const auto& pair) { + auto it = animatedNodeMeshes.find(pair.first); + return it != animatedNodeMeshes.end() && it->second == meshIndex; + })); + + if (isFirstUse && sourceMaterialMeshIdx < geometryEntities.size()) { + // Reuse existing entity for first animated node with this mesh (NOTE(review): assumes geometryEntities[i] corresponds to materialMeshes[i], which only holds if every CreateEntity call in the mesh loop above succeeded - confirm) + nodeEntity = geometryEntities[sourceMaterialMeshIdx]; + std::cout << "[Animation] Reusing existing entity for first 
node " << nodeIndex << std::endl; + + // CRITICAL: Clear any instance data from the reused entity + // If this mesh was set up for instanced rendering, we need to 
convert it + // to a single non-instanced entity for animation + auto* mesh = nodeEntity->GetComponent(); + if (mesh && mesh->GetInstanceCount() > 0) { + size_t instanceCount = mesh->GetInstanceCount(); + mesh->ClearInstances(); + std::cout << "[Animation] Cleared " << instanceCount + << " instances from reused entity for animation" << std::endl; + + // Recreate the GPU instance buffer with a single identity instance + // The old buffer still had multiple instances, so we need to update it + renderer->EnqueueInstanceBufferRecreation(nodeEntity); + } + } else { + // Create a new entity for this animated node (duplicate geometry) + std::string entityName = modelName + "_AnimNode_" + std::to_string(nodeIndex) + + "_Material_" + std::to_string(sourceMaterialMesh->materialIndex); + nodeEntity = engine->CreateEntity(entityName); + + if (nodeEntity) { + // Add transform component (will be set below) + nodeEntity->AddComponent(); + + // Clone the mesh component from the source MaterialMesh + auto* mesh = nodeEntity->AddComponent(); + mesh->SetVertices(sourceMaterialMesh->vertices); + mesh->SetIndices(sourceMaterialMesh->indices); + + // Copy all texture paths + if (!sourceMaterialMesh->baseColorTexturePath.empty()) + mesh->SetBaseColorTexturePath(sourceMaterialMesh->baseColorTexturePath); + if (!sourceMaterialMesh->normalTexturePath.empty()) + mesh->SetNormalTexturePath(sourceMaterialMesh->normalTexturePath); + if (!sourceMaterialMesh->metallicRoughnessTexturePath.empty()) + mesh->SetMetallicRoughnessTexturePath(sourceMaterialMesh->metallicRoughnessTexturePath); + if (!sourceMaterialMesh->occlusionTexturePath.empty()) + mesh->SetOcclusionTexturePath(sourceMaterialMesh->occlusionTexturePath); + if (!sourceMaterialMesh->emissiveTexturePath.empty()) + mesh->SetEmissiveTexturePath(sourceMaterialMesh->emissiveTexturePath); + + // Register textures with renderer + auto registerTex = [&](const std::string& texId) { + if (!texId.empty()) + renderer->RegisterTextureUser(texId, 
nodeEntity); + }; + registerTex(mesh->GetBaseColorTexturePath()); + registerTex(mesh->GetNormalTexturePath()); + registerTex(mesh->GetMetallicRoughnessTexturePath()); + registerTex(mesh->GetOcclusionTexturePath()); + registerTex(mesh->GetEmissiveTexturePath()); + + // Pre-allocate resources for this new entity + renderer->AddLoadingWorkItems(1); + renderer->EnqueueEntityPreallocationBatch({nodeEntity}); + + std::cout << "[Animation] Created new entity '" << entityName << "' for node " << nodeIndex << std::endl; + } + } + + if (nodeEntity) { + // Apply the base transform from the glTF node to this entity + auto transformIt = animatedNodeTransforms.find(nodeIndex); + if (transformIt != animatedNodeTransforms.end()) { + const glm::mat4& nodeTransform = transformIt->second; + + // Decompose the matrix into position, rotation, and scale + glm::vec3 nodePosition, nodeScale, skew; + glm::quat nodeRotation; + glm::vec4 perspective; + glm::decompose(nodeTransform, nodeScale, nodeRotation, nodePosition, skew, perspective); + + // Apply the node's local transform to the entity + auto* transform = nodeEntity->GetComponent(); + if (transform) { + transform->SetPosition(nodePosition); + transform->SetRotation(glm::eulerAngles(nodeRotation)); + transform->SetScale(nodeScale); + std::cout << "[Animation] Applied base transform to entity '" << nodeEntity->GetName() + << "' - pos(" << nodePosition.x << "," << nodePosition.y << "," << nodePosition.z << ")" << std::endl; + } + } + + nodeToEntity[nodeIndex] = nodeEntity; + std::cout << "[Animation] Linked entity '" << nodeEntity->GetName() + << "' to animated node " << nodeIndex << std::endl; + } + } + + animComponent->SetNodeToEntityMap(nodeToEntity); + + std::cout << "[Animation] Node-to-entity mapping has " << nodeToEntity.size() + << " entries (of " << animatedNodeMeshes.size() << " animated nodes)" << std::endl; + + // Auto-play the first animation + if (!animations.empty()) { + animComponent->Play(0, true); // Play first 
animation, looping + std::cout << "Auto-playing animation: " << animations[0].name + << " (duration: " << animations[0].GetDuration() << "s)" << std::endl; + } + } + } + } catch (const std::exception& e) { + std::cerr << "Error loading GLTF model: " << e.what() << std::endl; + return false; + } + + // Scene parsing is complete. Transition to Preallocating state so the render thread + // can process the entity geometry queue before moving to PhysicsInit and Play. + std::cout << "[Loading] Scene parsing complete. Transitioning to Preallocating." << std::endl; + renderer->SetInternalLoadingState(Renderer::InternalLoadingState::Preallocating); + // renderer->SetLoadingPhase(Renderer::LoadingPhase::Scene); + renderer->SetLoadingPhaseProgress(1.0f); + + // Request acceleration structure build for later (will be gated by state machine in Render) + if (renderer->GetRayQueryEnabled() && renderer->GetAccelerationStructureEnabled()) { + renderer->RequestAccelerationStructureBuild("Scene loading complete"); + } + + // Note: Entity preallocations are drained on the render thread after loading completes. + // The renderer will finalize the loading state (MarkInitialLoadComplete) once + // all geometry resources are created. + + return true; +} + +/** + * @brief Load a GLTF model with default transform values. + * @param engine The engine to create entities in. + * @param modelPath The path to the GLTF model file.
+ */ +void LoadGLTFModel(Engine* engine, const std::string& modelPath) { + // Use default transform values: origin position, no rotation, unit scale (the bool result of the full overload is discarded) + LoadGLTFModel(engine, modelPath, glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(1.0f, 1.0f, 1.0f)); +} diff --git a/en/Synchronization/Anatomy_of_a_Dependency/01_introduction.adoc b/en/Synchronization/Anatomy_of_a_Dependency/01_introduction.adoc new file mode 100644 index 00000000..57f17477 --- /dev/null +++ b/en/Synchronization/Anatomy_of_a_Dependency/01_introduction.adoc @@ -0,0 +1,25 @@ +:pp: {plus}{plus} += Anatomy of a Dependency: Introduction + +== Overview + +Every Vulkan operation, from a simple color clear to a complex ray-traced reflections pass, lives and breathes by the dependencies we define. In this chapter, we take a deep dive into the core mechanics of how data actually moves through the Vulkan pipeline and why synchronization is about much more than just "setting a bitmask." + +image::/images/rendering_pipeline_flowchart.png[Rendering Pipeline Flowchart, width=600, alt="Flowchart showing the stages of a modern Vulkan rendering pipeline"] + +To truly master synchronization, we first need to break down what happens when the GPU processes your commands. We often talk about the GPU as a "massive parallel processor," but what does that mean for data integrity? We'll start by deconstructing the fundamental differences between **Execution Dependencies** (the "when" of GPU work) and **Memory Dependencies** (the "where" and "visibility" of data). + +=== What You'll Learn in This Chapter + +This chapter is designed to move you from "making it work" to "knowing why it works." We'll explore: + +* **The Hardware Perspective**: Understanding why execution barriers alone are not enough to prevent data corruption on modern, multi-cache GPUs. +* **Execution vs. Memory Dependencies**: Learning how to distinguish between stopping a stage and ensuring its data is actually readable by the next one.
+* **The Synchronization 2 Advantage**: Why the new `vk::DependencyInfo` structure and the `pipelineBarrier2` command (`vkCmdPipelineBarrier2` in the C API) are more than just a syntax cleanup—they are a fundamental shift in how we express intent to the driver. +* **Surgical Precision with Pipeline Stages**: Mastering `vk::PipelineStageFlagBits2` and `vk::AccessFlagBits2` to target specific hardware units, ensuring maximum GPU occupancy by avoiding unnecessary pipeline bubbles. + +By the end of this chapter, you'll have a clear understanding of the "handshake" that must occur between any two pieces of GPU work. This foundation is crucial for everything that follows, from simple image layout transitions to complex asynchronous compute architectures. + +== Navigation + +Previous: xref:Synchronization/introduction.adoc[Introduction] | Next: xref:Synchronization/Anatomy_of_a_Dependency/02_execution_vs_memory.adoc[Execution vs. Memory Dependencies] diff --git a/en/Synchronization/Anatomy_of_a_Dependency/02_execution_vs_memory.adoc b/en/Synchronization/Anatomy_of_a_Dependency/02_execution_vs_memory.adoc new file mode 100644 index 00000000..c4821714 --- /dev/null +++ b/en/Synchronization/Anatomy_of_a_Dependency/02_execution_vs_memory.adoc @@ -0,0 +1,61 @@ +:pp: {plus}{plus} += Execution vs. Memory Dependencies + +== Introduction + +To understand why synchronization is so critical, we first need to look at what's happening under the hood when a GPU processes your work. Unlike a CPU, which generally executes instructions in a linear, predictable fashion, the GPU is a massive, highly parallel array of specialized hardware units. When you submit a command buffer, the GPU doesn't just start at the top and finish at the bottom; it distributes tasks across various stages of its pipeline—geometry, rasterization, fragment shading, and more—often all at once. + +This parallelism is what makes Vulkan powerful, but it's also where the danger lies.
If you want a fragment shader to read data that was just written by a compute shader, you must define exactly how that dependency works. In Vulkan, this is split into two distinct concepts: **Execution Dependencies** and **Memory Dependencies**. + +=== The "When": Execution Dependencies + +An **Execution Dependency** is the simplest form of synchronization. It answers the question: "When can this work start?" + +Imagine you have two commands: Command A and Command B. An execution dependency from A to B simply tells the GPU: "Don't start the specified pipeline stages of Command B until the specified pipeline stages of Command A have finished." + +This sounds straightforward, but here's the catch: on modern hardware, Command A finishing its work is *not* the same thing as its data being ready for Command B. Execution is just the trigger; memory is the substance. + +=== Architectural Realities: Caches and Memory Types + +Vulkan memory isn't just one big bucket where you store textures and buffers. Depending on your hardware, it's a complex landscape of different physical locations and access speeds. To sync effectively, you need to know what you're syncing against. + +On a **Discrete GPU**, you have dedicated Video RAM (VRAM) that is physically separate from your system's RAM. Moving data between these two is the job of the **DMA (Direct Memory Access)** engine—a specialized unit that can copy data across the PCI Express bus without bothering the main shader cores. When you upload a texture, you're often syncing the DMA engine with the Graphics pipeline. + +On the other hand, many laptops and mobile devices use **Unified Memory Architecture (UMA)**, where the CPU and GPU share the same physical RAM sticks. While this sounds like it should make things easier, it actually adds a hidden layer of complexity: **Caches**. Even if they share the RAM, the CPU has its own L1/L2/L3 caches, and the GPU has its own L1/L2 caches. 
If the GPU writes data to a shared buffer, that data might stay in the GPU's L2 cache and never actually reach the physical RAM. When the CPU tries to read it, it will see the old, stale value from the RAM or its own cache. + +In Vulkan, we categorize these behaviors into three primary memory types: + +* **Device Local**: This is memory that is "fastest" for the GPU to access. On a discrete card, this is the VRAM. On UMA, it's just a portion of the shared RAM. +* **Host Visible**: This memory can be "mapped" into your C{pp} application's address space, allowing the CPU to read and write to it directly. +* **Host Coherent**: A special type of Host Visible memory where the hardware automatically ensures that CPU and GPU see the same data without you needing to manually flush caches (though you still need an execution dependency to ensure the write has *finished*!). + +=== The "Where": Memory Dependencies + +This is where many Vulkan developers get caught. Even if Command A has finished, its output might still be sitting in a local L1 cache on a specific shader core, or it might be in a shared L2 cache that hasn't been written back to the main pool. If Command B—perhaps running on a completely different part of the GPU or even the CPU—tries to read that data from main memory before it has been "made available," it will read stale data. + +This is why we say execution is not enough. You can tell the hardware "Wait for the Compute Shader to finish before starting the Fragment Shader," and the hardware will happily oblige. But the Fragment Shader will then go to read the texture and find the old data because the Compute Shader's writes are still trapped in a local cache somewhere. + +A **Memory Dependency** ensures that data is properly moved between caches and main memory so it can be safely read. This involves two critical steps: + +1.
**Availability**: This operation "flushes" the data from the source's local caches so that it is available in a shared memory pool (like L2 cache or main memory). +2. **Visibility**: This operation "invalidates" the local caches of the destination stage, forcing it to read the fresh data from the shared memory pool rather than using whatever stale bits it might already have. + +Without both an execution dependency AND a memory dependency, you are living in a world of **hazards**. The most common is the "Read-After-Write" (RAW) hazard, where your fragment shader reads a texture before the compute shader has finished writing to it, resulting in the flickering or corrupted output that is so common in early Vulkan applications. + +=== The Practical Handshake + +Think of it as a professional handshake. An execution dependency is the two people agreeing to meet. A memory dependency is one person actually handing the document to the other and the other person making sure they are looking at the new document, not their old notes. + +In Synchronization 2, we define this handshake using `vk::PipelineStageFlagBits2` and `vk::AccessFlagBits2`. The stage flags define the *when* (the execution dependency), and the access flags define the *how* (the memory dependency). By pairing these correctly, you ensure that your data is not only processed in the right order but is also actually there when you go to look for it. + +== Simple Engine Implementation: Caches and Safety + +In `Simple Engine`, we handle these architectural realities through our `MemoryPool` class (`memory_pool.cpp`). When we allocate memory for a buffer or image, we specify the `vk::MemoryPropertyFlags` to decide its role. For example, our `UniformBuffer` objects are typically allocated as `HostVisible | HostCoherent`. This means the CPU can write to them and they are automatically visible to the GPU without a manual `flushMappedMemoryRanges` call.
+ +However, just because they are **coherent** doesn't mean we can ignore execution dependencies! Even in `Simple Engine`, if the CPU updates a `HostCoherent` uniform buffer while the GPU is in the middle of a fragment shader reading from it, we will encounter a **data race**. This is why we still use `inFlightFences` and semaphores to ensure the GPU has finished using a frame's resources before the CPU starts modifying them for the next frame. + +For our textures and vertex buffers, we use `DeviceLocal` memory for maximum performance. Because these are not host-coherent, we must use `vk::DependencyInfo` and `vk::ImageMemoryBarrier2` to explicitly manage the "Availability" and "Visibility" handshakes. This ensures that after a `vkCmdCopyBufferToImage` command, the data is properly flushed from the transfer unit's caches and invalidated for the fragment shader's caches. + +== Navigation + +Previous: xref:Synchronization/Anatomy_of_a_Dependency/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Anatomy_of_a_Dependency/03_sync2_advantage.adoc[The Synchronization 2 Advantage] diff --git a/en/Synchronization/Anatomy_of_a_Dependency/03_sync2_advantage.adoc b/en/Synchronization/Anatomy_of_a_Dependency/03_sync2_advantage.adoc new file mode 100644 index 00000000..f655f813 --- /dev/null +++ b/en/Synchronization/Anatomy_of_a_Dependency/03_sync2_advantage.adoc @@ -0,0 +1,195 @@ +:pp: {plus}{plus} += The Synchronization 2 Advantage + +== Introduction + +In the early days of Vulkan 1.0, defining dependencies was a fragmented and often frustrating process. You had to juggle multiple structures like `VkMemoryBarrier`, `VkBufferMemoryBarrier`, and `VkImageMemoryBarrier`. These structures weren't just numerous; they were also functionally separate, which meant the logic of your synchronization was spread across several different parts of your code. + +**Synchronization 2** (VK_KHR_synchronization2), which is now core in Vulkan 1.3, fundamentally changes this. 
It unifies these disparate barriers into a single, cohesive structure: `vk::DependencyInfo`. + +== The Fragmented Past: Why Legacy Was Hard + +To appreciate the advantage of Synchronization 2, we have to look at what we're leaving behind. In the legacy Vulkan 1.0 API, a pipeline barrier was a single function call that took three separate arrays of barriers: global, buffer, and image. + +[,cpp] +---- +// Legacy Vulkan 1.0 (Still works, but we don't like it) +vkCmdPipelineBarrier( + commandBuffer, + srcStageMask, // Global stage mask for ALL barriers + dstStageMask, // Global stage mask for ALL barriers + dependencyFlags, + memoryBarrierCount, pMemoryBarriers, + bufferMemoryBarrierCount, pBufferMemoryBarriers, + imageMemoryBarrierCount, pImageMemoryBarriers +); +---- + +Notice the problem? The `srcStageMask` and `dstStageMask` were passed as arguments to the *function*, not as part of the individual barrier structures. This meant that if you had two different image transitions in the same call—say, one from `Transfer` to `Fragment Shader` and another from `Compute` to `Vertex Shader`—you had to combine all those stages into a single, broad mask. + +This led to "over-synchronization." By merging the stages at the function level, you were inadvertently telling the GPU to wait for *all* the source stages to finish before *any* of the destination stages could start. You were creating a bottleneck where one didn't need to exist. + +image::/images/sync2_problem_over_sync.svg[Legacy Synchronization Log Jam, width=600, align="center"] + +== The "Chain of Intent": Unification with vk::DependencyInfo + +Synchronization 2 solves this by moving the stage masks into the barrier structures themselves. In our engine, we use `vk::DependencyInfo`, which acts as a container for all our synchronization needs. + +When we unify synchronization, we aren't just cleaning up the syntax. We are grouping the entire "intent" of the dependency in one place. 
With `vk::DependencyInfo`, each individual `vk::ImageMemoryBarrier2` or `vk::BufferMemoryBarrier2` contains its own `srcStageMask` and `dstStageMask`. + +[,cpp] +---- +// Synchronization 2 (The Modern Way) +vk::ImageMemoryBarrier2 imageBarrier{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + // ... layout transitions, image handles ... +}; + +vk::DependencyInfo dependencyInfo{ + .imageMemoryBarrierCount = 1, + .pImageMemoryBarriers = &imageBarrier +}; + +commandBuffer.pipelineBarrier2(dependencyInfo); +---- + +image::/images/sync2_solution_granular.svg[Synchronization 2 Granular Control, width=600, align="center"] + +This is a massive win for human readability. When you look at that `imageBarrier` block, you see a complete "handshake." You know exactly what work is finishing (`src`) and exactly what work is waiting (`dst`). There's no need to hunt through function arguments or global variables to find the other half of the dependency. + +== Granular Control with 64-bit Masks + +Another technical reason for the switch was simple math. The original Vulkan 1.0 flags were 32-bit bitmasks. As Vulkan evolved, we added Ray Tracing, Mesh Shading, Video Encoding, and more. We were literally running out of bits. + +With `vk::PipelineStageFlagBits2`, we've moved to 64-bit masks. This gives us the headroom to target specific hardware units with surgical precision. + +=== The Power of "None" +In legacy Vulkan, if you didn't need a memory dependency (just an execution one), you often had to pass `0` or use confusing flags like `BOTTOM_OF_PIPE`. In Sync 2, we have an explicit `vk::PipelineStageFlagBits2::eNone` and `vk::AccessFlagBits2::eNone`. 
+ +If you're doing a layout transition that doesn't require a memory flush (very rare, but possible), or if you just want to be absolutely clear that a certain barrier has no effect on a specific stage, `eNone` is your best friend. It makes the code self-documenting. + +== Sync 2 as a Mental Model + +Think of Synchronization 2 not as a new API, but as a better way to talk to the GPU. In the old system, you were shouting broad commands at the hardware: "EVERYONE STOP UNTIL THE RENDERING IS DONE!" + +In Synchronization 2, you're having a more nuanced conversation: "Hey, Color Attachment Output, once you're done writing this specific image, let the Fragment Shader know it's safe to start reading it." + +This "human-to-human" level of clarity is why we've built our entire engine around these structures. It reduces the cognitive load on you, the developer, and it gives the driver the exact information it needs to keep the GPU's "pipeline" full of work. + +== Putting it Together in the Engine + +In a real-world engine, synchronization isn't just about single transitions; it's about orchestrating the entire flow of data between passes. A perfect example of the Synchronization 2 "win" can be found in our `Renderer` class, specifically during a complex operation like a reflection pass. + +When rendering reflections, we often need to transition multiple resources at once—for example, a color buffer and a depth buffer. In the legacy API, these would be forced into a "log jam" where both would have to wait for the union of all stages. With Synchronization 2, we can batch them while maintaining their unique requirements. 
+ +[,cpp] +---- +void Renderer::renderReflectionPass(vk::raii::CommandBuffer& cmd) { + // Transition reflection color to COLOR_ATTACHMENT_OPTIMAL + vk::ImageMemoryBarrier2 toColor{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .oldLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .image = *reflectionColor, + .subresourceRange = { vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1 } + }; + + // Transition reflection depth to DEPTH_ATTACHMENT_OPTIMAL + vk::ImageMemoryBarrier2 toDepth{ + .srcStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .srcAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .image = *reflectionDepth, + .subresourceRange = { vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1 } + }; + + // The Win: Batching unique intents into a single call + std::array barriers{ toColor, toDepth }; + vk::DependencyInfo dependencyInfo{ + .imageMemoryBarrierCount = static_cast<uint32_t>(barriers.size()), + .pImageMemoryBarriers = barriers.data() + }; + + // One call, but the driver knows 'toColor' only cares about + // ColorAttachmentOutput, while 'toDepth' only cares about EarlyFragmentTests. + cmd.pipelineBarrier2(dependencyInfo); + + // Now we can safely begin our reflection rendering + // ... +} +---- + +To see the "win" here, let's look at the legacy alternative for that same `renderReflectionPass` operation.
You would have been forced to combine your stage masks at the function level: + +[,cpp] +---- +// Legacy Vulkan 1.0 equivalent - The "Log Jam" +vk::ImageMemoryBarrier legacyToColor{ + .srcAccessMask = vk::AccessFlagBits::eColorAttachmentWrite, + .dstAccessMask = vk::AccessFlagBits::eColorAttachmentWrite, + .oldLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .newLayout = vk::ImageLayout::eColorAttachmentOptimal, + .image = *reflectionColor, + .subresourceRange = { vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1 } +}; + +vk::ImageMemoryBarrier legacyToDepth{ + .srcAccessMask = vk::AccessFlagBits::eDepthStencilAttachmentWrite, + .dstAccessMask = vk::AccessFlagBits::eDepthStencilAttachmentWrite, + .oldLayout = vk::ImageLayout::eUndefined, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .image = *reflectionDepth, + .subresourceRange = { vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1 } +}; + +std::array legacyBarriers{ legacyToColor, legacyToDepth }; + +// NOTICE: The stage masks are passed to the function, not the barriers! +cmd.pipelineBarrier( + vk::PipelineStageFlagBits::eColorAttachmentOutput | vk::PipelineStageFlagBits::eEarlyFragmentTests, // srcStageMask (Union) + vk::PipelineStageFlagBits::eColorAttachmentOutput | vk::PipelineStageFlagBits::eEarlyFragmentTests, // dstStageMask (Union) + {}, // dependencyFlags + nullptr, nullptr, legacyBarriers +); +---- + +In this legacy version, you are forced to pass a single `srcStageMask` that is the union of `eColorAttachmentOutput` and `eEarlyFragmentTests`. This means the GPU would have to wait for *both* the color writes and the depth tests of all previous work to finish before it could even *start* transitioning either image. + +With Synchronization 2, if the depth tests finish early, the driver can begin the `toDepth` transition immediately, even if the color hardware is still busy. This keeps the GPU's "log jam" from forming, allowing different parts of the hardware to work at their own pace. 
+ +This code typically lives inside your frame recording logic, often in a dedicated pass function like `renderReflectionPass`, just before calling `beginRendering`. By placing the synchronization logic right where the resource is needed, and grouping related transitions into a single `vk::DependencyInfo`, you create a "Chain of Intent" that is both easy for you to read and optimal for the hardware to execute. + +== Implementation: Modernizing Simple Engine + +While `Simple Engine`'s renderer has been largely modernized to use the `pipelineBarrier2` calls we've discussed, the codebase still contains "legacy islands" that we are in the process of refactoring. A prime example is the `PhysicsSystem` (`physics_system.cpp`), which still uses the old-style `pipelineBarrier` for synchronizing its compute dispatches. + +If you look into `PhysicsSystem::SimulatePhysicsOnGPU`, you'll see transitions that look like this: + +[,cpp] +---- +// Legacy synchronization in PhysicsSystem +vulkanResources.commandBuffer.pipelineBarrier( + vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eComputeShader, + {}, + vk::MemoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead), + nullptr, nullptr +); +---- + +In the upcoming "Synchronization Upgrade" branch of `Simple Engine`, we will replace these with the cleaner `vk::DependencyInfo` and `vk::BufferMemoryBarrier2`. This will allow us to move away from global memory barriers and target the specific physics buffers, reducing the performance penalty on architectures with complex cache hierarchies. + +In the next section, we'll dive deeper into how to pick the right stages and flags to squeeze every last drop of performance out of the hardware. + +== Navigation + +Previous: xref:Synchronization/Anatomy_of_a_Dependency/02_execution_vs_memory.adoc[Execution vs. 
Memory Dependencies] | Next: xref:Synchronization/Anatomy_of_a_Dependency/04_refined_pipeline_stages.adoc[Refined Pipeline Stages] diff --git a/en/Synchronization/Anatomy_of_a_Dependency/04_refined_pipeline_stages.adoc b/en/Synchronization/Anatomy_of_a_Dependency/04_refined_pipeline_stages.adoc new file mode 100644 index 00000000..c2d579ec --- /dev/null +++ b/en/Synchronization/Anatomy_of_a_Dependency/04_refined_pipeline_stages.adoc @@ -0,0 +1,59 @@ +:pp: {plus}{plus} += Refined Pipeline Stages: Precision is Performance + +== Introduction + +In the previous sections, we saw how Synchronization 2 unifies the API. But the real performance gains come from how you use it. Mastering `vk::PipelineStageFlagBits2` and `vk::AccessFlagBits2` is about precision. In legacy Vulkan, many developers fell into the trap of using `eAllCommands` (or worse, `eTopOfPipe` and `eBottomOfPipe`) as a catch-all solution. While this "works" in the sense that it prevents data corruption, it’s the digital equivalent of stopping every car in the city just so one pedestrian can cross the street. + +=== The Pipeline Bubble + +When you use an overly broad stage mask, you create what’s known as a **Pipeline Bubble**. Modern GPUs are designed to keep as many specialized hardware units—the rasterizers, the compute cores, the fixed-function blit engines—busy as possible. If you tell the GPU to wait at `eAllCommands`, you are essentially draining the entire pipeline. The GPU must wait until every previous operation is completely finished before it can start even the smallest part of the next operation. + +image::/images/vulkan_pipeline_block_diagram.png[Vulkan Pipeline Block Diagram, width=800, alt="A block diagram of the Vulkan graphics pipeline showing its various stages"] + +With Synchronization 2, we can be far more surgical. 
If you're only interested in ensuring that a compute shader has finished writing to a storage buffer before a fragment shader reads it, you can target `eComputeShader` and `eFragmentShader` specifically. This allows other parts of the GPU, like the geometry engine or the rasterizer, to keep working on independent tasks. + +=== Choosing the Right Stage + +Picking the right stage mask requires a solid understanding of where your data is coming from and where it's going. Here are a few common patterns we use in our engine: + +* **Render to Texture**: If you're transitioning a color attachment so it can be sampled in a later pass, your source stage should be `eColorAttachmentOutput`. +* **Compute Post-Processing**: When a compute shader finishes a pass that will be used by the fragment shader, use `eComputeShader` as the source and `eFragmentShader` as the destination. +* **Transfer to Graphics**: When you've finished uploading a buffer or image using a transfer queue, the source stage is `eTransfer`. + +=== The Power of Access Flags + +Stage flags tell the GPU *when* to wait, but **Access Flags** tell it *why*. They control the cache flushes and invalidations we discussed in the "Execution vs. Memory" section. + +Pairing a stage with the correct access flag is vital. For example, if you're reading a storage buffer in a compute shader, you need `eShaderRead` or `eShaderStorageRead`. If you're writing to it, you need `eShaderWrite` or `eShaderStorageWrite`. Being specific here allows the hardware to perform only the necessary cache operations, which can significantly reduce the overhead of the barrier itself. + +=== Conclusion + +As we move forward into the more complex parts of this series—like asynchronous compute and asset streaming—keep this "precision-first" mindset. Every bit you set in a barrier is a hint to the hardware. The more accurate your hints, the smoother your frame rates will be. 
+ +== Simple Engine: Targeting the Right Units + +In `Simple Engine`, we apply this precision-first approach in our `Renderer::Render` loop. For example, when transitioning our depth buffer from a "Depth-Only" pass (like our shadow map generation or depth pre-pass) to a "Depth-Test" pass (like our main opaque pass), we use: + +[,cpp] +---- +// Depth transition in Renderer::Render +vk::ImageMemoryBarrier2 depthToRead2{ + .srcStageMask = vk::PipelineStageFlagBits2::eLateFragmentTests, + .srcAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead, + .oldLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, + // ... + .image = *depthImage, +}; +---- + +By specifying `eLateFragmentTests` and `eEarlyFragmentTests`, we tell the GPU that it only needs to wait for the fixed-function depth units to finish writing before it can start reading for the next pass. The vertex shaders for the next pass can actually start running and even begin processing their geometry while the previous pass's depth writes are still being finalized. This overlap is what prevents the "Pipeline Bubble" and keeps our frame rates high even in complex scenes. + +Next, we'll take these foundational concepts and apply them to the most common synchronization task in Vulkan: the image layout transition. 
+ +== Navigation + +Previous: xref:Synchronization/Anatomy_of_a_Dependency/03_sync2_advantage.adoc[The Synchronization 2 Advantage] | Next: xref:Synchronization/Anatomy_of_a_Dependency/05_conclusion.adoc[Conclusion] diff --git a/en/Synchronization/Anatomy_of_a_Dependency/05_conclusion.adoc b/en/Synchronization/Anatomy_of_a_Dependency/05_conclusion.adoc new file mode 100644 index 00000000..7adc89f5 --- /dev/null +++ b/en/Synchronization/Anatomy_of_a_Dependency/05_conclusion.adoc @@ -0,0 +1,26 @@ +:pp: {plus}{plus} += Anatomy of a Synchronization Dependency: Conclusion + +== Summary + +In this chapter, we have deconstructed the fundamental "handshake" that governs all GPU operations. By distinguishing between execution and memory dependencies, we've moved beyond simply "making it work" and towards a precise, performant understanding of how to orchestrate the complex parallelism of modern Vulkan. + +=== Summary of Barrier Types + +To solidify our understanding, let's summarize the three primary barrier types used in Synchronization 2: + +[cols="1,2,2", options="header"] +|=== +| Barrier Type | Purpose | Key Structure +| **Global Memory Barrier** | Synchronizes all memory accesses across specified pipeline stages. Use this when you need a broad flush that isn't tied to a specific resource. | `vk::MemoryBarrier2` +| **Buffer Memory Barrier** | Provides granular synchronization for a specific range of a `vk::Buffer`. Ideal for compute-to-compute or compute-to-vertex dependencies. | `vk::BufferMemoryBarrier2` +| **Image Memory Barrier** | The most powerful barrier. Handles execution and memory dependencies while also performing mandatory **Image Layout Transitions**. | `vk::ImageMemoryBarrier2` +|=== + +== Final Thoughts + +Every dependency you define is a contract between two operations.
By being surgical with your pipeline stages and access flags, you ensure that the GPU spends its time processing data rather than waiting in unnecessary bubbles. Mastering this dependency is the first step towards building high-performance, asynchronous engine architectures. + +== Navigation + +Previous: xref:Synchronization/Anatomy_of_a_Dependency/04_refined_pipeline_stages.adoc[Refined Pipeline Stages] | Next: xref:Synchronization/Pipeline_Barriers_Transitions/01_introduction.adoc[Pipeline Barriers and Transitions] diff --git a/en/Synchronization/Async_Compute_Overlap/01_introduction.adoc b/en/Synchronization/Async_Compute_Overlap/01_introduction.adoc new file mode 100644 index 00000000..6808d50f --- /dev/null +++ b/en/Synchronization/Async_Compute_Overlap/01_introduction.adoc @@ -0,0 +1,28 @@ +:pp: {plus}{plus} += Asynchronous Compute & Execution Overlap: Parallelizing the GPU + +== Introduction + +In many rendering architectures, work is submitted as a linear sequence of events. We draw the shadows, then the geometry, then we run a compute-based post-processing pass. This "serial" approach is easy to understand, but it often leaves significant portions of the GPU hardware idle. Modern GPUs are composed of multiple independent units—graphics pipelines, compute units, and transfer engines—that can, and should, work simultaneously. + +**Asynchronous Compute** is the practice of running compute workloads (like physics, occlusion culling, or post-processing) on a dedicated compute queue while the main graphics queue is busy with its own work. When done correctly, this can lead to massive performance gains by effectively filling the "holes" in the GPU's execution timeline. + +== The "Bubble" Problem + +The primary enemy of high performance is the **Pipeline Stall**, often called a "bubble." This happens when one part of the GPU has finished its work but cannot start the next task because it's waiting for a dependency that hasn't been satisfied. 
If your barriers are too conservative—for example, if you tell the GPU to wait for "All Commands" to finish before starting a compute pass—you are essentially forcing the hardware into a serial mode, even if the compute work could have started much earlier. + +== Architecting for Overlap + +To achieve true execution overlap, we need to move beyond simple "top-of-pipe" to "bottom-of-pipe" dependencies. We need to architect our `vk::DependencyInfo` and our **Timeline Semaphores** to express the exact moment data is ready. + +In this chapter, we will explore: + +1. **Maximizing Throughput**: How to identify workloads that are good candidates for overlap and how to structure your submissions to keep the GPU occupancy as high as possible. +2. **Async Post-Processing**: We'll implement a common real-world pattern: running compute-based bloom or tonemapping concurrent with the subsequent frame's shadow or geometry pass. +3. **Eliminating the Stalls**: We'll learn how to use hardware profilers and synchronization validation to find those elusive "bubbles" and refine our stage masks to eliminate them. + +By the end of this chapter, you'll be able to move your engine from a serial sequence to a parallel execution model, ensuring that no hardware unit is left sitting idle. 
+ +== Navigation + +Previous: xref:Synchronization/Frame_in_Flight/03_resource_lifetimes.adoc[Resource Lifetimes] | Next: xref:Synchronization/Async_Compute_Overlap/02_maximizing_throughput.adoc[Maximizing Throughput] diff --git a/en/Synchronization/Async_Compute_Overlap/02_maximizing_throughput.adoc b/en/Synchronization/Async_Compute_Overlap/02_maximizing_throughput.adoc new file mode 100644 index 00000000..13dc4090 --- /dev/null +++ b/en/Synchronization/Async_Compute_Overlap/02_maximizing_throughput.adoc @@ -0,0 +1,99 @@ +:pp: {plus}{plus} += Maximizing Throughput: Identifying Overlap Candidates + +== Finding the "Holes" in the GPU + +To maximize GPU throughput, we need to think beyond the simple linear execution of our command buffers. We want to find workloads that are **latency-bound** (spending a lot of time waiting for memory or fixed-function units) and pair them with workloads that are **compute-bound** (using the GPU's arithmetic units heavily). + +A classic example of this is the **Shadow Pass**. While the GPU is busy doing vertex processing and rasterizing depth-only geometry for shadows, many of the compute and shading units are sitting idle. This is a perfect "hole" that can be filled with an asynchronous compute task, such as a physics simulation or an occlusion culling pass. + +== The Simple Engine Case Study: Physics and Audio Compute + +In our `Simple Engine`, we have two major systems that are prime candidates for asynchronous compute: the **Physics System** (`physics_system.cpp`) and the **Audio HRTF System** (`audio_system.cpp`). + +The `PhysicsSystem` performs complex simulation tasks like integration and collision detection using GPU-accelerated compute shaders (`shaders/physics.slang`). Similarly, the `AudioSystem` uses a compute shader (`shaders/hrtf.slang`) to process audio spatialization (Head-Related Transfer Function) on the GPU. + +Currently, both systems follow a **sequential, blocking** pattern. 
For example, the physics simulation is submitted to the GPU, and the CPU immediately stalls at a fence: + +[,cpp] +---- +// Sequential Physics Dispatch (Current Engine) +physicsSystem->Update(deltaTime); // Internally calls SimulatePhysicsOnGPU + +// Inside PhysicsSystem::SimulatePhysicsOnGPU: +// 1. Submit compute commands to computeQueue +// 2. ReadbackGPUPhysicsData: blocks on a fence (CPU STALL!) +---- + +This CPU-side stall is a missed opportunity for overlap. To maximize throughput, we can re-architect this flow to be asynchronous by utilizing the engine's dedicated **Compute Queue** (obtained via `renderer->GetComputeQueue()`). By submitting these tasks early in the frame and only synchronizing when the data is strictly necessary, we can keep both the graphics and compute hardware units fully occupied. + +Beyond physics and audio, the engine's **Forward+ Rendering** path (see `ForwardPlus_Rendering.adoc`) is another prime candidate for overlap. The Forward+ compute pass (`forward_plus_cull.slang`) builds light lists for each tile on the screen. While this compute pass *does* require the depth buffer from the current frame to perform effective Z-culling, it doesn't need to wait for the entire geometry pass to finish. + +If we use **Timeline Semaphores**, we can tell the compute queue to wait only until the **Depth Pre-pass** is complete. While the graphics queue continues with the main **Opaque Geometry** rendering, the compute queue can simultaneously be culling lights for those same pixels, perfectly overlapping the compute-heavy light assignment with the raster-heavy geometry processing. + +== The Dependency Architecture + +The key to allowing these workloads to overlap is the way we architect our dependencies. If we use a single, global timeline for everything, we might inadvertently create a bottleneck. Instead, we should use multiple timeline semaphores—one for each major "engine" of the GPU—and have them coordinate only when strictly necessary. 
+ +For example, your graphics queue could signal a "Geometry Complete" value on its own timeline. Your compute queue could wait for that value before starting its work, while simultaneously continuing with other tasks that don't depend on the geometry. + +[,cpp] +---- +// Compute queue waiting for graphics geometry completion +auto computeWaitInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *graphicsTimeline, + .value = geometryFrameValue, + .stageMask = vk::PipelineStageFlagBits2::eComputeShader +}; + +auto computeSubmit = vk::SubmitInfo2{ + .waitSemaphoreInfoCount = 1, + .pWaitSemaphoreInfos = &computeWaitInfo, + // ... +}; + +computeQueue.submit2(computeSubmit); +---- + +== Submitting for Overlap + +Simply having multiple queues isn't enough. You also need to submit your work in a way that the hardware can actually parallelize. On most modern hardware, this means submitting your "background" compute work to a dedicated asynchronous compute queue. + +=== Identifying Dedicated Queues + +In Vulkan, queues are grouped into **Queue Families**. To get a truly asynchronous compute queue, you should look for a queue family that supports `vk::QueueFlagBits::eCompute` but NOT `vk::QueueFlagBits::eGraphics`. This ensures the hardware has a dedicated path for compute that doesn't share the same front-end command processor as the graphics unit. 
+ +Here is how we identify these dedicated families in our engine: + +[,cpp] +---- +uint32_t computeQueueFamilyIndex = std::numeric_limits<uint32_t>::max(); +auto queueFamilies = physicalDevice.getQueueFamilyProperties(); + +for (uint32_t i = 0; i < queueFamilies.size(); ++i) { + // Look for a family that has compute but NOT graphics for true async + if ((queueFamilies[i].queueFlags & vk::QueueFlagBits::eCompute) && + !(queueFamilies[i].queueFlags & vk::QueueFlagBits::eGraphics)) { + computeQueueFamilyIndex = i; + break; + } +} + +// Fallback: if no dedicated compute family exists, use any that supports compute +if (computeQueueFamilyIndex == std::numeric_limits<uint32_t>::max()) { + for (uint32_t i = 0; i < queueFamilies.size(); ++i) { + if (queueFamilies[i].queueFlags & vk::QueueFlagBits::eCompute) { + computeQueueFamilyIndex = i; + break; + } + } +} +---- + +By decoupling the submission of your compute work from your main graphics loop using these dedicated queues, you allow the driver to schedule them concurrently. If the graphics queue is momentarily stalled (e.g., waiting for the display or a cache flush), the compute queue can step in and keep the hardware busy. + +In the next section, we'll see a concrete implementation of this pattern: async post-processing.
+ +== Navigation + +Previous: xref:Synchronization/Async_Compute_Overlap/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Async_Compute_Overlap/03_async_post_processing.adoc[Async Post-Processing] diff --git a/en/Synchronization/Async_Compute_Overlap/03_async_post_processing.adoc b/en/Synchronization/Async_Compute_Overlap/03_async_post_processing.adoc new file mode 100644 index 00000000..b90d3103 --- /dev/null +++ b/en/Synchronization/Async_Compute_Overlap/03_async_post_processing.adoc @@ -0,0 +1,72 @@ +:pp: {plus}{plus} += Async Post-Processing: Parallelizing Frame End and Start + +== A Real-World Use Case + +One of the most effective ways to use asynchronous compute is to run your post-processing pass (which is usually compute-bound) while the graphics unit is busy with the shadow or geometry pass of the *next* frame. This is a powerful pattern because post-processing typically happens at the very end of the frame, when the graphics units have finished their work. Instead of making the next frame wait for post-processing to complete, we move it to a dedicated compute queue. + +== Implementing the Overlap + +The implementation involves two different queues: a **Graphics Queue** for your geometry and shadow work, and an **Asynchronous Compute Queue** for your post-processing work (e.g., bloom, tonemapping, or temporal anti-aliasing). + +1. **Main Render Pass (Graphics Queue)**: Once your main rendering is complete, signal a "Graphics Complete" value on your graphics timeline. +2. **Post-Processing Pass (Compute Queue)**: The compute queue waits for the "Graphics Complete" value. It then performs the post-processing work and signals a "Post-Processing Complete" value on its own compute timeline. +3. **Frame Submission (CPU)**: The CPU can start recording and submitting the *next* frame to the graphics queue as soon as the previous frame's geometry is submitted. It doesn't need to wait for the post-processing to finish. 
+ +== Synchronization 2 Example + +Using `vk::SemaphoreSubmitInfo` and `vk::SubmitInfo2`, this coordination is clear and precise. + +[,cpp] +---- +// Compute Submit: wait for frame N graphics to finish, then run post-processing +auto computeWaitInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *graphicsTimeline, + .value = frameN_graphics_finished, + .stageMask = vk::PipelineStageFlagBits2::eComputeShader +}; + +auto computeSignalInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *computeTimeline, + .value = frameN_postprocessing_finished, + .stageMask = vk::PipelineStageFlagBits2::eComputeShader +}; + +auto computeSubmit = vk::SubmitInfo2{ + .waitSemaphoreInfoCount = 1, + .pWaitSemaphoreInfos = &computeWaitInfo, + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &postProcessCmdInfo, + .signalSemaphoreInfoCount = 1, + .pSignalSemaphoreInfos = &computeSignalInfo +}; + +computeQueue.submit2(computeSubmit); +---- + +== Handling the Present + +The final step is the **Present** operation. On the CPU side, you must ensure that you don't present the final image until both the graphics and compute work for that frame are complete. + +Specifically, because the presentation unit (WSI) doesn't yet support timeline semaphores, you must use a **binary semaphore** for the final handshake. Your compute queue signals a binary semaphore upon completion of the post-processing pass. The `vk::PresentInfoKHR` then waits on this binary semaphore before displaying the image to the screen. + +This three-way handshake—graphics signals compute, compute signals present—ensures that the graphics units are always fed with new work (from the *next* frame), while the compute units handle the final look of the *current* frame. It's a key strategy for maximizing your engine's frame rate and keeping your GPU occupancy as high as possible. + +== Implementing in Simple Engine + +In `Simple Engine`, we will apply this async post-processing pattern to our **PBR Tonemapping** pass.
Currently, the tonemapping is done at the end of `Renderer::Render` on the graphics queue. We will move this logic to a dedicated `postProcessComputePipeline` that runs on the `computeQueue`. + +To implement this: + +1. **Add Compute Pass**: We'll update our `Renderer` to record the tonemapping compute shader (`shaders/tonemap.slang`) into a separate compute command buffer. +2. **Wait for Graphics**: The submission of this compute command buffer will wait for the main rendering timeline to reach the `GeometryFinished` value. +3. **Signal for Present**: Once the tonemapping is complete, the compute submission will signal a `PostProcessFinished` value on the compute timeline, together with a binary semaphore reserved for presentation. +4. **Update Submit**: We'll update our final `vk::SubmitInfo2` for the frame to signal that binary semaphore and have `vk::PresentInfoKHR` wait on it, because the present operation cannot wait on a timeline value such as `PostProcessFinished`. + +By moving tonemapping to the compute queue, we can start the **next frame's shadow pass** on the graphics queue while the current frame is still being tonemapped. This overlaps the raster-heavy shadow pass with the compute-heavy tonemapping pass, significantly improving our overall frame throughput. + +In the final section of this chapter, we'll look at how to identify and eliminate the "bubbles" that can occur if your synchronization is too conservative. + +== Navigation + +Previous: xref:Synchronization/Async_Compute_Overlap/02_maximizing_throughput.adoc[Maximizing Throughput] | Next: xref:Synchronization/Async_Compute_Overlap/04_bubble_problem.adoc[The Bubble Problem] diff --git a/en/Synchronization/Async_Compute_Overlap/04_bubble_problem.adoc b/en/Synchronization/Async_Compute_Overlap/04_bubble_problem.adoc new file mode 100644 index 00000000..85e52097 --- /dev/null +++ b/en/Synchronization/Async_Compute_Overlap/04_bubble_problem.adoc @@ -0,0 +1,30 @@ +:pp: {plus}{plus} += The Bubble Problem: Finding and Fixing Stalls + +== Identifying the Bubble + +A "bubble" in the GPU timeline is a period where some units are idle because they are waiting for a dependency to be satisfied.
These can be hard to find just by looking at your code. You might *think* you've enabled overlap, but if your stage masks are too broad, the GPU might still be stalling. + +To find these, we use hardware profilers like **NVIDIA Nsight Graphics**, **AMD Radeon GPU Profiler**, or even the **LunarG Synchronization Validation** layer. In a profiler, a bubble looks like a gap in the timeline where the Graphics or Compute rows are empty while the other is busy. + +image::/images/vulkan_simplified_pipeline.svg[Vulkan Simplified Pipeline, width=400, alt="Simplified diagram of the Vulkan pipeline used to illustrate where bubbles can occur"] + +== Common Causes of Bubbles + +1. **Overly Conservative Stage Masks**: If you use `vk::PipelineStageFlagBits2::eAllCommands` for every barrier, the GPU will flush everything and wait for it to be idle before starting the next task. This is the most common cause of bubbles. Always use the most specific stage mask possible. +2. **Sequential Submission**: Even if you have two queues, if your CPU code waits for one to finish before submitting to the other, you've created a bubble on the CPU side. Use the **Wait-Before-Signal** pattern and multiple submission threads where appropriate. +3. **Dependency Chains**: A chain of small dependencies can sometimes be more expensive than one slightly broader barrier. If you have five compute passes that all wait for each other, each one introduces a small stall. Sometimes batching these into a single compute submission is better. + +== Fixing the Stall + +Once you've found a bubble, the fix is usually to refine your `vk::DependencyInfo`. + +- **Refine Stage Masks**: Check if you can move your `srcStageMask` later in the pipeline or your `dstStageMask` earlier. For example, can your compute work start as soon as `eVertexShader` is done, instead of waiting for `eFragmentShader`? 
+- **Use Memory Barriers Wisely**: Sometimes a global memory barrier is better than several image barriers if it allows more work to start sooner. +- **Increase Concurrency**: If your profiler shows that the compute units are under-utilized, can you move more work (like occlusion culling) from graphics to compute? + +By systematically finding and eliminating these bubbles, you move from a renderer that "just works" to one that is truly professional-grade. In the next chapter, we'll see how these same principles apply to one of the most common background tasks in modern games: asset streaming. + +== Navigation + +Previous: xref:Synchronization/Async_Compute_Overlap/03_async_post_processing.adoc[Async Post-Processing] | Next: xref:Synchronization/Transfer_Queues_Streaming/01_introduction.adoc[Transfer Queues & Asset Streaming Sync] diff --git a/en/Synchronization/Dynamic_Rendering_Sync/01_introduction.adoc b/en/Synchronization/Dynamic_Rendering_Sync/01_introduction.adoc new file mode 100644 index 00000000..97df7e1a --- /dev/null +++ b/en/Synchronization/Dynamic_Rendering_Sync/01_introduction.adoc @@ -0,0 +1,27 @@ +:pp: {plus}{plus} += Synchronization in Dynamic Rendering: A Pass-less World + +== Introduction + +For much of its early history, Vulkan synchronization was tied heavily to the concept of **Render Passes** and **Subpasses**. While this was designed to help mobile GPUs optimize on-tile memory usage, it was often confusing for developers and led to overly complex code. The "Subpass Dependency" was the primary way to sync data between different stages of a render pass, but it felt like a legacy structure that didn't always match the way modern engines work. + +With the introduction of **Dynamic Rendering** (introduced in Vulkan 1.3), the API has moved away from these rigid structures. There are no more `VkRenderPass` or `VkFramebuffer` objects to manage. Instead, you simply call `beginRendering` and `endRendering`. 
This change has made Vulkan much easier to use, but it has also shifted the responsibility for synchronization entirely to us. + +== The Explicit Era + +In a world without subpass dependencies, every synchronization point must be explicit. If you want to use the output of one draw call as the input for another within the same rendering block, you can no longer rely on the render pass to handle the transition for you. You must use the **Synchronization 2** barriers we learned about in Chapter 3. + +This shift is actually a major advantage. It provides far more clarity and control. You know exactly where your transitions are happening because you recorded them yourself. It also makes it much easier to integrate with modern engine architectures where rendering passes are fluid and often determined at runtime. + +== What We'll Explore + +In this chapter, we'll dive into how synchronization works in this modern, pass-less landscape. We'll explore: + +1. **Subpass Replacement**: How to use explicit barriers to coordinate synchronization between rendering attachments, replacing the legacy `VkSubpassDependency` structures. +2. **Local Read Sync**: We'll look at one of the most exciting features of **Vulkan 1.4**: `VK_KHR_dynamic_rendering_local_read`. This allows you to perform on-tile operations (like reading from a depth buffer in a fragment shader) with the same performance as legacy subpasses but with the simplicity of dynamic rendering. + +By the end of this chapter, you'll be able to confidently architect a high-performance renderer using the latest Vulkan features, ensuring that your synchronization is as streamlined and efficient as your rendering code. 
+ +== Navigation + +Previous: xref:Synchronization/Transfer_Queues_Streaming/03_staging_sync.adoc[Staging Synchronization] | Next: xref:Synchronization/Dynamic_Rendering_Sync/02_subpass_replacement.adoc[Subpass Replacement] diff --git a/en/Synchronization/Dynamic_Rendering_Sync/02_subpass_replacement.adoc b/en/Synchronization/Dynamic_Rendering_Sync/02_subpass_replacement.adoc new file mode 100644 index 00000000..ff89c34f --- /dev/null +++ b/en/Synchronization/Dynamic_Rendering_Sync/02_subpass_replacement.adoc @@ -0,0 +1,63 @@ +:pp: {plus}{plus} += Subpass Replacement: Syncing Without the Pass + +== The End of the Subpass Dependency + +In the legacy Vulkan "Render Pass" system, you defined your dependencies upfront. If you wanted to use a G-Buffer pass and then a lighting pass, you'd create a subpass dependency that specified how data was transitioned and synchronized. This was often confusing because it separated the synchronization from the actual commands that were using it. + +With **Dynamic Rendering**, we replace these dependencies with **Synchronization 2** barriers that we record directly between our draw calls. This approach is far more intuitive. If your second draw call needs to read from the output of the first, you record a barrier in between. + +== A Concrete Example + +Imagine you're building a G-Buffer. You have a "Depth-Only" pass to pre-populate the depth buffer, followed by a "Main Pass" that reads from that depth buffer for early-Z testing. + +[,cpp] +---- +// 1. Depth Pre-Pass +commandBuffer.beginRendering(depthPrePassInfo); +// ... record depth draw calls ... +commandBuffer.endRendering(); + +// 2. 
Synchronization Barrier +auto depthBarrier = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eLateFragmentTests, + .srcAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests, + .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead, + .oldLayout = vk::ImageLayout::eDepthAttachmentOptimal, + .newLayout = vk::ImageLayout::eDepthAttachmentOptimal, // No layout change needed + .image = depthBuffer.image(), + .subresourceRange = subresourceRange +}; + +commandBuffer.pipelineBarrier2(vk::DependencyInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &depthBarrier}); + +// 3. Main Pass +commandBuffer.beginRendering(mainPassInfo); +// ... record main draw calls ... +commandBuffer.endRendering(); +---- + +== Why This is Better + +- **Clarity**: You can see exactly what is being synchronized and why, right there in your command buffer. +- **Flexibility**: You can decide on the synchronization at runtime, making it much easier to build a flexible rendering graph. +- **Modernity**: It matches the way other modern APIs, like DirectX 12, handle synchronization, making your engine code more portable. + +By using explicit barriers, you move away from the "black box" of the legacy render pass system and toward a clear, surgical synchronization architecture. In the next section, we'll see how **Vulkan 1.4** takes this even further by allowing for efficient on-tile read operations. + +== Simple Engine: Dynamic Rendering Sync + +In `Simple Engine`, we use this explicit synchronization between our **Opaque Pre-Pass** and our **Main Pass**. Because we don't have a traditional render pass to handle these transitions, we record our own `vk::ImageMemoryBarrier2` to ensure the depth buffer is properly flushed and invalidated. + +Specifically, in `Renderer::Render`, you'll find the following sequence: + +1. 
**Depth Pre-Pass**: We call `commandBuffer.beginRendering` for the depth pre-pass. +2. **Barrier**: After `endRendering`, we record a `depthToRead2` barrier. This barrier synchronizes the `eLateFragmentTests` (the depth writes) with the `eEarlyFragmentTests` (the depth reads) of the next pass. +3. **Main Opaque Pass**: We then call `beginRendering` again for our main opaque color pass, which now has safe access to the pre-filled depth buffer. + +This explicit approach is what allowed us to easily add **Forward+ Lighting** to `Simple Engine`. Since we already had the depth buffer synchronized, adding the light culling compute pass between the pre-pass and the main pass was a straightforward matter of adding one more barrier, without having to re-architect a complex legacy render pass. + +== Navigation + +Previous: xref:Synchronization/Dynamic_Rendering_Sync/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Dynamic_Rendering_Sync/03_local_read_sync.adoc[Local Read Sync] diff --git a/en/Synchronization/Dynamic_Rendering_Sync/03_local_read_sync.adoc b/en/Synchronization/Dynamic_Rendering_Sync/03_local_read_sync.adoc new file mode 100644 index 00000000..0545427d --- /dev/null +++ b/en/Synchronization/Dynamic_Rendering_Sync/03_local_read_sync.adoc @@ -0,0 +1,67 @@ +:pp: {plus}{plus} += Local Read Sync: On-Tile Efficiency with Dynamic Rendering + +== The Best of Both Worlds + +In the legacy render pass system, we used subpasses to perform efficient on-tile read operations. This allowed the GPU to read from a color or depth attachment directly from its on-chip memory (the tile cache), avoiding expensive trips to main memory. This was a critical optimization for mobile and tiled-rendering GPUs. + +With the introduction of **Vulkan 1.4**, this same efficiency is now available in **Dynamic Rendering** through the `VK_KHR_dynamic_rendering_local_read` feature. This gives us the simplicity of a "pass-less" world with the performance of a subpass-based world. 
+ +== Implementing the Local Read + +The implementation involves two parts: a specialized barrier and a specific rendering setup. When you use a local read, you tell the GPU: "I want to read from an attachment, but I promise the read will only occur at the same pixel (x, y) location as the current write." This allows the hardware to keep the data on-tile. + +[,cpp] +---- +// 1. Define the Dependency +auto localReadBarrier = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eInputAttachmentRead, + .oldLayout = vk::ImageLayout::eRenderingLocalRead, + .newLayout = vk::ImageLayout::eRenderingLocalRead, + .image = gBufferAttachment.image(), + .subresourceRange = subresourceRange +}; + +commandBuffer.pipelineBarrier2(vk::DependencyInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &localReadBarrier}); + +// 2. Perform the Rendering +// You must include the local read information in your RenderingInfo +auto localReadInfo = vk::RenderingInputAttachmentIndexInfo{ + .colorAttachmentCount = 1, + .pColorAttachmentInputIndices = &colorIndex +}; + +auto renderingInfo = vk::RenderingInfo{ + .pNext = &localReadInfo, + // ... +}; + +commandBuffer.beginRendering(renderingInfo); +// ... record your on-tile reads in your Slang shader ... +commandBuffer.endRendering(); +---- + +== Slang Integration + +In your Slang shader, you use the standard input attachment syntax. The Slang compiler will correctly target the SPIR-V instructions required for local read access. This ensures that your shader code remains clean and portable across different hardware. + +[,cpp] +---- +// Slang snippet +[[vk::input_attachment_index(0)]] +InputAttachment gBufferInput; + +float4 main(float2 uv : TEXCOORD0) : SV_Target { + float4 data = gBufferInput.SubpassLoad(); + // ... 
+} +---- + +By mastering local read synchronization, you can build a modern deferred renderer that is every bit as efficient as a legacy subpass-based renderer, but with the flexibility and clarity of modern Vulkan. In the next chapter, we'll see how these principles apply to the direct CPU-to-GPU data movements in **Host Image Copies**. + +== Navigation + +Previous: xref:Synchronization/Dynamic_Rendering_Sync/02_subpass_replacement.adoc[Subpass Replacement] | Next: xref:Synchronization/Host_Image_Copies_Memory_Sync/01_introduction.adoc[Host Image Copies & Memory Mapped Sync] diff --git a/en/Synchronization/Frame_in_Flight/01_introduction.adoc b/en/Synchronization/Frame_in_Flight/01_introduction.adoc new file mode 100644 index 00000000..efaf6cfa --- /dev/null +++ b/en/Synchronization/Frame_in_Flight/01_introduction.adoc @@ -0,0 +1,30 @@ +:pp: {plus}{plus} += Frame-in-Flight Architecture: The Heartbeat of Your Engine + +== Introduction + +In the early days of graphics programming, we often thought of rendering as a linear sequence: the CPU records some commands, the GPU executes them, and then the CPU waits for the GPU to finish before starting the next frame. This is simple, but it’s also incredibly slow. While the GPU is rendering, the CPU is sitting idle, and while the CPU is recording the next frame, the GPU is waiting for work. + +To achieve high performance, we need to overlap these two processes. This is what we call **Frame-in-Flight Architecture**. We want to have multiple frames being processed simultaneously—for example, the CPU might be recording frame 3, while the GPU is still rendering frame 2, and the display is currently showing frame 1. This concept is introduced in the base tutorial's xref:03_Drawing_a_triangle/03_Drawing/03_Frames_in_flight.adoc[Frames in flight] chapter, but here we take it to the next level using timeline semaphores. 
+ +== The Synchronization Challenge + +Managing multiple concurrent frames is arguably the most complex synchronization challenge in a Vulkan engine. You have to ensure that: + +1. **Data Integrity**: You don't overwrite a uniform buffer that the GPU is currently reading for a previous frame. +2. **Resource Lifetimes**: You don't destroy a texture or a command buffer until you are absolutely certain the GPU has finished using it. +3. **Forward Progress**: You don't submit so many frames that you introduce massive input lag or run out of memory. + +In the legacy Vulkan 1.0 world, this was handled using a complex array of fences and binary semaphores for each frame in flight. This led to "sync-heavy" code that was difficult to scale and easy to break. + +== The Timeline Advantage + +By using **Timeline Semaphores** as our foundation, we can drastically simplify this architecture. Instead of managing a separate fence for every frame, we use a single monotonic counter that represents the "completed frame index." + +In this chapter, we are going to rebuild the main engine loop to handle an arbitrary number of frames in flight. We'll explore how to use the timeline to coordinate between the CPU and GPU, and how to implement a robust resource management system that uses the timeline to determine exactly when it's safe to destroy or reuse our Vulkan objects. + +Let's begin by looking at how to rebuild the heartbeat of our engine: the main render loop. 
+ +== Navigation + +Previous: xref:Synchronization/Timeline_Semaphores/04_wait_before_signal.adoc[Wait-Before-Signal Submission] | Next: xref:Synchronization/Frame_in_Flight/02_managing_concurrent_frames.adoc[Managing Concurrent Frames] diff --git a/en/Synchronization/Frame_in_Flight/02_managing_concurrent_frames.adoc b/en/Synchronization/Frame_in_Flight/02_managing_concurrent_frames.adoc new file mode 100644 index 00000000..d3ea5b64 --- /dev/null +++ b/en/Synchronization/Frame_in_Flight/02_managing_concurrent_frames.adoc @@ -0,0 +1,106 @@ +:pp: {plus}{plus} += Managing Concurrent Frames: Rebuilding the Main Loop + +== The Goal: Overlap Without Chaos + +The purpose of a frame-in-flight system is simple: keep the GPU busy while the CPU prepares future frames. The trick is doing that without corrupting data or introducing unbounded latency. With timeline semaphores, we can express this cleanly using a single, monotonic value that represents "frame N is complete." + +== A Practical Structure + +We'll use a ring of per-frame data (command buffers, descriptor sets, transient buffers). Each frame has an associated timeline value that marks when it's safe to reuse those resources. + +[,cpp] +---- +struct FrameContext { + vk::raii::CommandPool pool{nullptr}; + vk::raii::CommandBuffer cmd{nullptr}; + vk::raii::Fence fence{nullptr}; // optional if you only use timeline waits on CPU + uint64_t retireValue = 0; // timeline value when this frame finishes +}; + +std::array<FrameContext, MaxFramesInFlight> frames; + +vk::raii::Semaphore timeline = createTimelineSemaphore(device, /*initial=*/0); +uint64_t nextSubmitValue = 1; // monotonically increasing +---- + +== The Main Loop With Timeline Gating + +On each frame, choose the next `FrameContext` in the ring. Before you touch any of its resources, make sure the global timeline has reached the value that was signaled when those resources were last used.
+ +[,cpp] +---- +FrameContext& fc = frames[currentFrameIndex]; + +// Wait until GPU has reached the value when this frame's resources were last retired +if (fc.retireValue != 0) { + auto waitInfo = vk::SemaphoreWaitInfo{ + .semaphoreCount = 1, + .pSemaphores = &(*timeline), + .pValues = &fc.retireValue + }; + device.waitSemaphores(waitInfo, /*timeoutNs=*/UINT64_C(1'000'000'000)); // 1s timeout +} + +// Record & submit this frame +recordCommands(fc.cmd /*, ... */); + +// Define the value that represents "this frame complete" +const uint64_t frameComplete = nextSubmitValue++; + +vk::SemaphoreSubmitInfo signalInfo{ + .semaphore = *timeline, + .value = frameComplete, + .stageMask = vk::PipelineStageFlagBits2::eAllCommands +}; + +vk::CommandBufferSubmitInfo cmdInfo{ .commandBuffer = *fc.cmd }; + +vk::SubmitInfo2 submit{ + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdInfo, + .signalSemaphoreInfoCount= 1, + .pSignalSemaphoreInfos = &signalInfo +}; + +graphicsQueue.submit2(submit); + +// Tag this frame's resources with the value at which they're safe to reuse +fc.retireValue = frameComplete; +---- + +== CPU Throttle Without Fences + +To limit latency (e.g., only 2–3 frames in flight), wait for the value that corresponds to "the oldest in-flight frame has finished" before starting a new one. No per-frame fences necessary. + +[,cpp] +---- +const uint64_t minAllowedValue = frameCompleteValueFor(currentFrameIndex - (MaxFramesInFlight - 1)); +if (minAllowedValue) { + auto waitInfo = vk::SemaphoreWaitInfo{ + .semaphoreCount = 1, + .pSemaphores = &(*timeline), + .pValues = &minAllowedValue + }; + device.waitSemaphores(waitInfo, UINT64_MAX); +} +---- + +This approach centralizes flow control around a single, debuggable counter. In the next section, we'll use the same counter to make precise, low-overhead decisions about resource destruction and reuse. 
+ +== How to implement this in Simple Engine + +To implement this in `Simple Engine`, we will refactor the `Renderer::Render` method. Currently, it relies on `inFlightFences[currentFrame]` to stall the CPU. We will replace this with a single `Renderer::frameTimeline` semaphore. + +The new logic will look like this: + +1. **Calculate Retire Value**: Instead of `waitForFences`, we will calculate the `retireValue` for the current frame slot. This is simply the timeline value assigned to this slot the last time it was submitted (e.g., `frameTimelineValue[currentFrame]`). +2. **Wait on Timeline**: We'll call `device.waitSemaphores` to wait for that `retireValue`. This ensures the GPU is finished with the resources (command buffers, descriptor sets) associated with this frame slot. +3. **Submit with Signal**: When we call `queue.submit2`, we'll include a `vk::SemaphoreSubmitInfo` that signals our `frameTimeline` with a new, incremented value. +4. **Update Frame Slot**: We'll store this new signal value in `frameTimelineValue[currentFrame]` so we can wait for it the next time this slot comes around in the ring. + +This refactor will allow us to remove the `inFlightFences` array entirely, simplifying our resource management and making it easier to integrate other asynchronous systems into the same "Master Clock." 
+ +== Navigation + +Previous: xref:Synchronization/Frame_in_Flight/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Frame_in_Flight/03_resource_lifetimes.adoc[Resource Lifetimes] diff --git a/en/Synchronization/Frame_in_Flight/03_resource_lifetimes.adoc b/en/Synchronization/Frame_in_Flight/03_resource_lifetimes.adoc new file mode 100644 index 00000000..e6cbfdbd --- /dev/null +++ b/en/Synchronization/Frame_in_Flight/03_resource_lifetimes.adoc @@ -0,0 +1,61 @@ +:pp: {plus}{plus} += Resource Lifetimes: Safe Reuse Without deviceWaitIdle() + +== Tagging and Reclamation + +One of the biggest challenges in Vulkan is knowing when it's safe to reuse or destroy a resource. With timeline semaphores, we treat destruction and reuse as a function of the global counter: a resource becomes eligible for reclamation once the counter reaches or exceeds the value signaled by the last submission that used it. + +We maintain a small allocator or freelist for transient resources (command buffers, staging buffers, descriptor sets). Each allocation is tagged with a `retireValue`. + +[,cpp] +---- +struct TrackedResource { + ResourceHandle handle{}; // your wrapper around vk objects + uint64_t retireValue = 0; // timeline value when last submitted use completes +}; + +void destroyWhenSafe(TrackedResource res) { + deferredDeletes.push_back(res); +} + +void gc(vk::raii::Device const& device, vk::raii::Semaphore const& timeline) { + const uint64_t now = device.getSemaphoreCounterValue(*timeline); + auto it = std::remove_if(deferredDeletes.begin(), deferredDeletes.end(), [&](TrackedResource const& r){ + if (now >= r.retireValue) { destroy(r.handle); return true; } + return false; + }); + deferredDeletes.erase(it, deferredDeletes.end()); +} +---- + +== Integrating With Submissions + +Whenever you submit work that references a resource, tag it with the same value you signal on the timeline for that submission.
+ +[,cpp] +---- +const uint64_t submissionValue = nextSubmitValue++; +submitCommands(cmd, /*signals*/ submissionValue); + +TrackedResource tex = createTexture(/*...*/); +tex.retireValue = submissionValue; // safe to reuse/destroy once reached +---- + +This pattern scales to complex graphs. You can attach `retireValue`s to entire resource sets created for a frame, or to individual allocations in sub-systems like upload managers. + +== Simple Engine: Garbage Collection + +In `Simple Engine`, we currently handle deferred resource destruction using a simple "frames since destroy" counter in our `pendingASDeletions` queue (found in `renderer_rendering.cpp`). This system waits for a fixed number of frames (`MAX_FRAMES_IN_FLIGHT + 1`) before deleting an acceleration structure. While safe, it is imprecise and can lead to resources staying in memory longer than necessary if the GPU is running fast. + +By moving to a timeline-based **Garbage Collection (GC)** system, we can be much more efficient. We will tag each `pendingASDeletion` (and any other transient resource, like our staging buffers) with the exact `frameTimelineValue` at which it was last used. Our `Renderer::ProcessDeferredDeletions` function will then query the current `frameTimeline` value. If the GPU has already reached or passed the tagged value, we can delete the resource immediately. This ensures that memory is reclaimed as soon as the GPU is done with it, regardless of the current frame rate or CPU/GPU load. + +== Pitfalls and Best Practices + +- Don't leak values: keep `nextSubmitValue` monotonic but bounded in meaning (e.g., encode frame and pass indices) to aid debugging. +- Batch deletions in `gc()` to avoid per-frame spikes. +- Avoid mixing fences and timeline for the same lifetime decision to prevent contradictory states. +- For external queues/devices (e.g., interop), convert their completion signals into your timeline domain where possible. 
+ +== Navigation + +Previous: xref:Synchronization/Frame_in_Flight/02_managing_concurrent_frames.adoc[Managing Concurrent Frames] | Next: xref:Synchronization/Async_Compute_Overlap/01_introduction.adoc[Asynchronous Compute & Execution Overlap - Introduction] diff --git a/en/Synchronization/Host_Image_Copies_Memory_Sync/01_introduction.adoc b/en/Synchronization/Host_Image_Copies_Memory_Sync/01_introduction.adoc new file mode 100644 index 00000000..3adc5a46 --- /dev/null +++ b/en/Synchronization/Host_Image_Copies_Memory_Sync/01_introduction.adoc @@ -0,0 +1,33 @@ +:pp: {plus}{plus} += Host Image Copies & Memory Mapped Sync: Direct Access + +== Introduction + +For most of Vulkan's history, if you wanted to move data into an image, you had to follow a very specific ritual: create a staging buffer, map it, write your data, record a `copyBufferToImage` command, and then submit that command buffer to a queue. While this is efficient for large, asynchronous uploads, it's a lot of overhead for simple, direct updates—like updating a small UI texture or a single mip level. + +With the arrival of **Vulkan 1.4**, we have a powerful new tool: **Host Image Copies** (`VK_EXT_host_image_copy`). This feature allows the CPU to copy data directly into a GPU-optimal image without recording or submitting a single command buffer. It's the most direct way to move data between CPU and GPU memory. + +[NOTE] +==== +While Host Image Copies were promoted to core in Vulkan 1.4, support for this feature is still **optional**. You must check the `VkPhysicalDeviceHostImageCopyFeaturesEXT` (or the 1.4 equivalent) to ensure your hardware and driver support this direct path. +==== + +== The Synchronization Challenge + +While Host Image Copies simplify the "how" of moving data, they don't exempt us from the "when." Because we are moving data directly on the host (CPU), we must be extremely careful to ensure that the GPU isn't trying to use that same image while we are writing to it. 
+ +This introduces a different kind of synchronization. We aren't just syncing two GPU queues; we are syncing the **Host** with the **Device**. We need to ensure that our host writes are **visible** to the GPU, and that any previous GPU work is **available** before we start our host copy. + +== What We'll Explore + +In this chapter, we'll dive into the world of host-side synchronization. We'll explore: + +1. **Direct CPU-to-Image Access**: How to utilize the new Vulkan 1.4 Host Image Copy features to move data efficiently without command buffer overhead. +2. **Visibility and Flushes**: Mastering `vk::MemoryBarrier2` specifically for host-mapped memory. We'll learn how to ensure data coherency across the bus, ensuring that the bytes we write on the CPU are exactly what the GPU sees. +3. **Host-Device Handshakes**: Coordinating with fences and timeline semaphores to ensure that our host-side copies never collide with active GPU rendering. + +By the end of this chapter, you'll have a complete understanding of how to manage direct memory access in modern Vulkan, providing you with a faster, more flexible way to keep your assets updated. + +== Navigation + +Previous: xref:Synchronization/Dynamic_Rendering_Sync/03_local_read_sync.adoc[Local Read Sync] | Next: xref:Synchronization/Host_Image_Copies_Memory_Sync/02_cpu_to_image_access.adoc[Direct CPU-to-Image Access] diff --git a/en/Synchronization/Host_Image_Copies_Memory_Sync/02_cpu_to_image_access.adoc b/en/Synchronization/Host_Image_Copies_Memory_Sync/02_cpu_to_image_access.adoc new file mode 100644 index 00000000..6f1e7ab4 --- /dev/null +++ b/en/Synchronization/Host_Image_Copies_Memory_Sync/02_cpu_to_image_access.adoc @@ -0,0 +1,56 @@ +:pp: {plus}{plus} += Direct CPU-to-Image Access: Utilizing Host Image Copies + +== Moving Data Directly on the Host + +One of the most powerful features in **Vulkan 1.4** is the ability to move data directly on the host (CPU). 
This is handled by the `vk::Device::copyMemoryToImageEXT` function (or its equivalent in your RAII wrapper). This function takes raw CPU memory and copies it directly into a GPU-optimal image. + +This is a major productivity boost. You no longer have to manage staging buffers, command pools, or submission queues for simple, direct image updates. It's the most direct way to move data from a CPU-side resource, like a dynamic texture or a screenshot buffer, into a GPU-side image. + +== Implementing the Host Copy + +To use this feature, you first need to check for support for the `VK_EXT_host_image_copy` extension (now part of Vulkan 1.4). Once confirmed, you can perform a copy like this: + +[,cpp] +---- +// 1. Prepare the copy info +auto copyInfo = vk::MemoryToImageCopyEXT{ + .pHostPointer = cpuData, + .memoryRowLength = width, + .memoryImageHeight = height, + .imageSubresource = { .aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1 }, + .imageExtent = { .width = width, .height = height, .depth = 1 } +}; + +auto hostCopyInfo = vk::CopyMemoryToImageInfoEXT{ + .dstImage = *gpuImage, + .dstImageLayout = vk::ImageLayout::eGeneral, + .regionCount = 1, + .pRegions = &copyInfo +}; + +// 2. Perform the copy directly on the CPU +device.copyMemoryToImageEXT(hostCopyInfo); +---- + +== Use Cases and Advantages + +Host Image Copies are ideal for scenarios where you need to update a small amount of data quickly and don't want to wait for the GPU's command processor. + +- **Dynamic Textures**: Updating UI elements, font atlases, or small dynamic textures. +- **Screenshots**: Copying a GPU-optimal image back to the CPU for saving to disk (using the inverse `copyImageToMemoryEXT`). +- **Debugging**: Quickly inspecting the contents of a GPU-side resource from your CPU code. + +The primary advantage is **lower latency**. 
Because you aren't recording and submitting a command buffer, you eliminate all the driver and hardware overhead associated with submission. The CPU simply moves the bytes, and they are available on the GPU immediately. + +== Potential in Simple Engine + +In `Simple Engine`, we can use Host Image Copies to optimize our **Screenshot** system. Currently, taking a screenshot involves a multi-step process of recording a command buffer to copy the swapchain image to a staging buffer, submitting that command buffer, and then waiting for a fence on the CPU. This is slow and can cause a noticeable hitch in the frame rate. + +By moving to `copyImageToMemoryEXT` (available in Vulkan 1.4), we can perform the screenshot copy directly on the CPU. Once the frame is finished and the swapchain image is in the `ePresentSrcKHR` layout, we can call `copyImageToMemoryEXT` from our main thread. This moves the pixels directly from the GPU's memory into our CPU-side screenshot buffer, completely bypassing the command submission and fence-wait cycle. This results in a much smoother user experience and a cleaner, more direct implementation of the screenshot feature in our engine. + +In the next section, we'll see how to handle the synchronization required to ensure that these bytes are visible to the GPU and that we don't create any host-device hazards. 
+ +== Navigation + +Previous: xref:Synchronization/Host_Image_Copies_Memory_Sync/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Host_Image_Copies_Memory_Sync/03_visibility_flushes.adoc[Visibility & Flushes] diff --git a/en/Synchronization/Host_Image_Copies_Memory_Sync/03_visibility_flushes.adoc b/en/Synchronization/Host_Image_Copies_Memory_Sync/03_visibility_flushes.adoc new file mode 100644 index 00000000..7392e63b --- /dev/null +++ b/en/Synchronization/Host_Image_Copies_Memory_Sync/03_visibility_flushes.adoc @@ -0,0 +1,58 @@ +:pp: {plus}{plus} += Visibility & Flushes: Mastering Coherency + +== Understanding Host-Device Synchronization + +When you use **Host Image Copies**, you are essentially performing a direct memory copy between the CPU and GPU. This is highly efficient, but it introduces a new kind of synchronization challenge. We must ensure that the data we write on the host (CPU) is **visible** to the GPU before it starts using it, and that any previous GPU work is **available** before we start our host copy. + +In the world of **Synchronization 2**, we use `vk::MemoryBarrier2` to express this. We are no longer syncing two different GPU stages; we are syncing the host and the device. + +== The Host-to-Device Dependency + +The most common case is a **Host-to-Device** dependency. You write some data on the CPU and then want the GPU to read it in a shader. To do this, you use a barrier with `srcStageMask = vk::PipelineStageFlagBits2::eHost` and `dstStageMask` set to the shader stage where the image will be read. 
+ +[,cpp] +---- +auto hostToDeviceBarrier = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eHost, + .srcAccessMask = vk::AccessFlagBits2::eHostWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eGeneral, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .image = gpuImage.image(), + .subresourceRange = subresourceRange +}; + +commandBuffer.pipelineBarrier2(vk::DependencyInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &hostToDeviceBarrier}); +---- + +The `eHost` stage mask is a special flag that tells the GPU: "This data was updated on the CPU. Please ensure that all CPU writes are visible before the fragment shader starts its read." + +== The Device-to-Host Dependency + +The inverse case is a **Device-to-Host** dependency—for example, when you take a screenshot. You must ensure that the GPU has finished its rendering before the CPU starts the host copy. To do this, you record a barrier with the appropriate GPU stages as the source and `eHost` as the destination. + +[,cpp] +---- +auto deviceToHostBarrier = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eHost, + .dstAccessMask = vk::AccessFlagBits2::eHostRead, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::eGeneral, + .image = gpuImage.image(), + .subresourceRange = subresourceRange +}; + +commandBuffer.pipelineBarrier2(vk::DependencyInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &deviceToHostBarrier}); +---- + +In addition to the barrier, you must also use a **Fence** or a **Timeline Semaphore** on the CPU side to ensure that the command buffer containing the barrier has actually finished executing on the GPU before you attempt to call `device.copyImageToMemoryEXT`. 
+ +By mastering these host-device handshakes, you can build a renderer that is both extremely fast and perfectly robust, giving you a powerful new tool for managing your engine's memory. In the final chapters of this series, we'll see how to debug and optimize these complex synchronization patterns using the latest Vulkan tools. + +== Navigation + +Previous: xref:Synchronization/Host_Image_Copies_Memory_Sync/02_cpu_to_image_access.adoc[Direct CPU-to-Image Access] | Next: xref:Synchronization/Synchronization_Validation/01_introduction.adoc[Debugging with Synchronization Validation] diff --git a/en/Synchronization/Pipeline_Barriers_Transitions/01_introduction.adoc b/en/Synchronization/Pipeline_Barriers_Transitions/01_introduction.adoc new file mode 100644 index 00000000..9830a0cf --- /dev/null +++ b/en/Synchronization/Pipeline_Barriers_Transitions/01_introduction.adoc @@ -0,0 +1,18 @@ +:pp: {plus}{plus} += Pipeline Barriers and Layout Transitions: The Core Loop + +== Introduction + +If the previous chapter was about understanding the theoretical "handshake" between GPU stages, this chapter is where we get our hands dirty with the actual implementation. In the modern Vulkan 1.3+ landscape, the `vk::ImageMemoryBarrier2` is the most common tool in our synchronization toolbox. It's how we transition images between layouts, ensure data is visible across different hardware caches, and manage the complex state changes required for high-performance rendering. + +We often think of an image as just a grid of pixels, but to the GPU, it's a sophisticated resource that can be optimized for different types of access. A layout that's great for writing as a color attachment might be terrible for sampling in a fragment shader. Managing these transitions efficiently—and only when strictly necessary—is what separates a stuttering renderer from a smooth, 60 FPS experience. + +In this chapter, we're going to dive deep into the mechanics of these barriers. 
We'll start with the anatomy of the image barrier itself, specifically within the context of **Dynamic Rendering**, which has largely replaced the legacy "Render Pass" system. We'll then tackle one of the most misunderstood topics in Vulkan: **Queue Family Ownership**. This is the explicit "hand-off" required when you want to move a resource, like a texture or a buffer, between the Graphics, Compute, and Transfer queues of your engine. + +Finally, we'll look at the performance implications of our choices. Vulkan gives us the option of using **Global Memory Barriers** or more specific, resource-bound barriers. We'll learn how to determine which one to use and when, so we can give the driver exactly the right amount of information to keep the hardware running at full tilt without introducing unnecessary stalls. + +Let's begin by looking at the workhorse of modern Vulkan synchronization: the Image Memory Barrier 2. + +== Navigation + +Previous: xref:Synchronization/Anatomy_of_a_Dependency/04_refined_pipeline_stages.adoc[Refined Pipeline Stages] | Next: xref:Synchronization/Pipeline_Barriers_Transitions/02_image_barrier.adoc[The Image Barrier] diff --git a/en/Synchronization/Pipeline_Barriers_Transitions/02_image_barrier.adoc b/en/Synchronization/Pipeline_Barriers_Transitions/02_image_barrier.adoc new file mode 100644 index 00000000..3b2d2dde --- /dev/null +++ b/en/Synchronization/Pipeline_Barriers_Transitions/02_image_barrier.adoc @@ -0,0 +1,120 @@ +:pp: {plus}{plus} += The Image Barrier: Implementing vk::ImageMemoryBarrier2 + +== The Core Mechanism + +In the world of modern Vulkan, the image memory barrier is the definitive tool for managing how resources flow through the pipeline. While the theory of synchronization is about "execution" and "visibility," the image barrier adds a third, equally critical component: **Layout Transitions**. Unlike a buffer, which is just a linear strip of memory, an image has a layout that determines how its texels are organized. 
+ +If we want to write to an image as a color attachment, the GPU hardware expects it to be in `VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL`. If we later want to sample that same image in a shader, it must be transitioned to `VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL`. This is not just a driver-side "flag"—on some hardware, this transition might trigger a physical reorganization of the data or a cache flush. + +image::/images/image_barrier_anatomy.svg[Anatomy of an Image Barrier] + +When we talk about "physical reorganization," we're referring to how different hardware units see the same bits. For instance, a Rasterizer might use a specialized tiled compression format (like Delta Color Compression) to save bandwidth. However, a Compute shader sampling that same image might not understand that compression. The layout transition ensures the data is "decompressed" or moved into a format that the next stage can consume. + +== Deconstructing the Image Barrier + +When we record a pipeline barrier, we are essentially defining a "gate" that the GPU must pass through. 
Let’s look at how we construct this using the RAII-style Vulkan-Hpp wrappers we use in our engine: + +[,cpp] +---- +auto imageBarrier = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .image = renderTarget.image(), + .subresourceRange = { + .aspectMask = vk::ImageAspectFlagBits::eColor, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1 + } +}; + +auto dependencyInfo = vk::DependencyInfo{ + .imageMemoryBarrierCount = 1, + .pImageMemoryBarriers = &imageBarrier +}; + +commandBuffer.pipelineBarrier2(dependencyInfo); +---- + +In this example, we're transitioning a color attachment so it can be sampled by a subsequent fragment shader. The `srcStageMask` tells the GPU "wait for the color attachment output stage of previous commands to finish," while the `srcAccessMask` specifies that we are specifically waiting for the memory *writes* from that stage to be complete. On the other side of the gate, the `dstStageMask` and `dstAccessMask` ensure that the fragment shader stage will wait to start its read operations until the layout transition and cache flushes are finished. + +== The Power of Layout Discard + +One of the most common performance optimizations in Vulkan is the use of `vk::ImageLayout::eUndefined` as the `oldLayout`. When we set the old layout to undefined, we are telling the driver: "I don't care about what was in this image before." + +This is incredibly powerful. If the driver knows the previous content is garbage, it can skip the expensive work of preserving data during a layout transition. 
For example, if you're about to clear an image and use it as a fresh color attachment, transitioning from `eUndefined` to `eColorAttachmentOptimal` is significantly faster than transitioning from `eShaderReadOnlyOptimal` (which might require a "resolve" or "decompression" of the previous frame's data). + +== Subresource Ranges and Aspect Masks + +Vulkan doesn't just let us synchronize an entire image; it gives us surgical control over specific parts of it via the `subresourceRange`. This is vital for complex effects: + +* **Mipmap Generation**: We can transition mip level 0 to `eTransferSrcOptimal` and level 1 to `eTransferDstOptimal` to perform a blit, then transition them back. +* **Aspect Masks**: For depth-stencil formats, we might only want to transition the `eDepth` aspect while leaving `eStencil` alone (or vice versa). +* **Layered Rendering**: In VR or cubemap rendering, we can transition individual array layers independently to allow different parts of the GPU to work on different views simultaneously. + +== Synchronization in Dynamic Rendering + +One of the major shifts in modern Vulkan is the move toward **Dynamic Rendering** (core since Vulkan 1.3, or via extensions). In the old "Render Pass" system, transitions were often hidden within subpass dependencies or the render pass definition itself. This was often confusing and led to over-synchronization. + +With dynamic rendering, the responsibility for transitions falls squarely on us. We typically record our transitions *between* rendering scopes—after one pass's `endRendering` and before the next pass's `beginRendering`—rather than inside an active `beginRendering`/`endRendering` pair. This might feel like more work, but it provides far more clarity. We know exactly where the transition is happening because we recorded it explicitly. It also makes it much easier to integrate with modern engine architectures where rendering passes are more fluid and less rigid than the legacy system. + +== Putting it Together in the Engine + +In a real-world engine, you rarely emit just one barrier. You batch them. 
Here is how our `Renderer` might handle a common "Post-Process" sequence where we transition both the scene color and the depth buffer (for depth-of-field) before the final UI pass: + +[,cpp] +---- +std::array<vk::ImageMemoryBarrier2, 2> barriers; + +// Transition Scene Color from Attachment to Shader Read +barriers[0] = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eColorAttachmentOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .image = sceneColor.image() +}; + +// Transition Depth from Attachment to Shader Read (Depth Aspect only!) +barriers[1] = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eLateFragmentTests, + .srcAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal, + .newLayout = vk::ImageLayout::eDepthReadOnlyOptimal, + .image = depthBuffer.image(), + .subresourceRange = { + .aspectMask = vk::ImageAspectFlagBits::eDepth, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1 + } +}; + +commandBuffer.pipelineBarrier2(vk::DependencyInfo{ + .imageMemoryBarrierCount = static_cast<uint32_t>(barriers.size()), + .pImageMemoryBarriers = barriers.data() +}); +---- + +By batching these into a single `DependencyInfo`, the driver can optimize the state changes and cache flushes, ensuring the GPU spends more time drawing and less time waiting for barriers. + +== Simple Engine: The Unified Barrier + +In `Simple Engine`, we consolidate our image transitions to minimize driver overhead. 
If you look at `Renderer::Render` in `renderer_rendering.cpp`, you'll see how we handle the transition from the **Opaque Pass** to the **Post-Processing Pass**. We don't just transition the color buffer; we often transition the depth buffer and any auxiliary buffers (like our G-Buffer for Forward+ lighting) in a single `vk::DependencyInfo`. + +One specific trick we use in `Simple Engine` is the **Layout Tracking** system. Because our `Renderer` can switch between different rendering paths (like Rasterization vs. Ray Query), we keep track of the current layout of our main images (like `opaqueSceneColorImageLayouts`). When we begin a pass, we check the current layout and only emit a barrier if a transition is actually necessary. If the image is already in the correct layout, we skip the barrier entirely, saving precious GPU cycles. + +== Navigation + +Previous: xref:Synchronization/Pipeline_Barriers_Transitions/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Pipeline_Barriers_Transitions/03_queue_family_ownership.adoc[Queue Family Ownership] diff --git a/en/Synchronization/Pipeline_Barriers_Transitions/03_queue_family_ownership.adoc b/en/Synchronization/Pipeline_Barriers_Transitions/03_queue_family_ownership.adoc new file mode 100644 index 00000000..5b9b6bcb --- /dev/null +++ b/en/Synchronization/Pipeline_Barriers_Transitions/03_queue_family_ownership.adoc @@ -0,0 +1,79 @@ +:pp: {plus}{plus} += Queue Family Ownership: The Handshake + +== Why We Transfer Ownership + +In many high-performance Vulkan engines, we don't just use a single "Graphics" queue for everything. We might use a dedicated **Transfer Queue** for background asset streaming or a **Compute Queue** for asynchronous post-processing. However, Vulkan resources (buffers and images) are generally "owned" by a specific queue family if they were created with `vk::SharingMode::eExclusive`. 
+ +If you want to move an image from your Transfer queue (where you just uploaded it) to your Graphics queue (where you want to draw it), you must perform an explicit **Queue Family Ownership Transfer**. This is a two-step "handshake" that involves a release operation on the source queue and an acquire operation on the destination queue. + +== The Release and Acquire Handshake + +The transfer happens by recording a pipeline barrier on both queues. Crucially, both barriers must specify the source and destination queue family indices. + +=== 1. The Release Operation (Source Queue) + +On the queue that currently owns the resource, you record a barrier that "releases" it. The `srcQueueFamilyIndex` is your current queue, and the `dstQueueFamilyIndex` is the queue you are sending it to. + +[,cpp] +---- +auto releaseBarrier = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eAllTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eNone, // No stage on this queue + .dstAccessMask = vk::AccessFlagBits2::eNone, // No access on this queue + .oldLayout = vk::ImageLayout::eTransferDstOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = transferQueueIndex, + .dstQueueFamilyIndex = graphicsQueueIndex, + .image = texture.image(), + .subresourceRange = subresourceRange +}; + +// Record on Transfer Command Buffer +transferCommandBuffer.pipelineBarrier2(vk::DependencyInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &releaseBarrier}); +---- + +=== 2. The Acquire Operation (Destination Queue) + +On the target queue, you record a barrier that "acquires" the resource. The indices remain the same, but now the `srcStageMask` and `srcAccessMask` are set to `eNone` because those stages happened on a different queue. 
+ +[,cpp] +---- +auto acquireBarrier = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eNone, + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eTransferDstOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = transferQueueIndex, + .dstQueueFamilyIndex = graphicsQueueIndex, + .image = texture.image(), + .subresourceRange = subresourceRange +}; + +// Record on Graphics Command Buffer +graphicsCommandBuffer.pipelineBarrier2(vk::DependencyInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &acquireBarrier}); +---- + +== Orchestration with Semaphores + +Recording the barriers is only half the battle. You also need to ensure that the Graphics queue doesn't try to acquire the resource before the Transfer queue has released it. This is typically handled with a **Semaphore**. The Transfer queue signals a semaphore upon completion of its command buffer, and the Graphics queue waits on that same semaphore before executing its own acquire barrier. + +This handshake is one of the more complex parts of Vulkan synchronization, but it's essential for building a multi-threaded, non-blocking engine architecture. In modern Vulkan, we prefer **Timeline Semaphores** for this orchestration, as they allow us to track this progress with a simple monotonic counter, which we'll cover in detail in the next chapter. + +== Simple Engine: Resource Handoff + +In `Simple Engine`, we avoid the complexity of ownership transfers where possible by using `vk::SharingMode::eConcurrent` when creating our major buffers and images. If the hardware supports it, this allows multiple queue families (like our `transferQueue` and `graphicsQueue`) to access the same memory concurrently without an explicit "Release/Acquire" barrier. 
+ +[WARNING] +==== +While `eConcurrent` is convenient, using it for **images** can result in lower performance on some hardware implementations compared to `eExclusive` with explicit ownership transfers. For buffers, the impact is generally negligible, but for high-performance image handling, the "handshake" is often preferred. +==== + +However, even with `eConcurrent`, you still need to synchronize the *execution* of those queues! In `Simple Engine`, we use a dedicated **Transfer Semaphore** to ensure that our graphics queue doesn't start sampling a texture until the transfer queue has finished its work. This is handled during the `Renderer::ProcessPendingMeshUploads` call, ensuring that all background uploads are correctly "visible" to the graphics hardware before the next frame begins. + +== Navigation + +Previous: xref:Synchronization/Pipeline_Barriers_Transitions/02_image_barrier.adoc[The Image Barrier] | Next: xref:Synchronization/Pipeline_Barriers_Transitions/04_global_vs_local_barriers.adoc[Global vs. Local Barriers] diff --git a/en/Synchronization/Pipeline_Barriers_Transitions/04_global_vs_local_barriers.adoc b/en/Synchronization/Pipeline_Barriers_Transitions/04_global_vs_local_barriers.adoc new file mode 100644 index 00000000..deb0b959 --- /dev/null +++ b/en/Synchronization/Pipeline_Barriers_Transitions/04_global_vs_local_barriers.adoc @@ -0,0 +1,65 @@ +:pp: {plus}{plus} += Global vs. Local Barriers: Precision and Performance + +== The Dilemma of Choice + +Vulkan gives us two ways to synchronize memory: **Global Memory Barriers** and **Specific Resource Barriers** (Image and Buffer barriers). It's often tempting to just use a global barrier for everything—it's simpler to write, requires less bookkeeping, and covers all your bases. However, this convenience comes at a cost. + +A global barrier affects *all* memory accesses of the specified type across the entire GPU. 
If you only need to transition a single texture, but you use a global memory barrier, the GPU might end up flushing its entire L1 and L2 cache, potentially stalling other unrelated work that was running perfectly fine. + +== When to Use Global Barriers + +Global barriers are not "evil"; they are simply a broad tool. They are excellent for scenarios where you are about to perform a major state change that affects many resources simultaneously. + +For example, if you are moving from a G-Buffer pass to a complex lighting pass that will read from multiple textures and buffers, a single global barrier might be more efficient than recording ten individual image and buffer barriers. Consolidating into a single global barrier reduces the driver overhead of processing the `vk::DependencyInfo` and can sometimes lead to better hardware utilization if many resources are transitioning between similar stages. + +[,cpp] +---- +auto globalBarrier = vk::MemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput, + .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead +}; + +commandBuffer.pipelineBarrier2(vk::DependencyInfo{.memoryBarrierCount = 1, .pMemoryBarriers = &globalBarrier}); +---- + +== When to Use Resource Barriers + +Resource-specific barriers (`vk::ImageMemoryBarrier2` and `vk::BufferMemoryBarrier2`) are your "surgical" tools. You should use them whenever the dependency is limited to a specific resource, especially if that resource is being transitioned between layouts. + +The primary advantage of an image barrier is that it allows the driver to perform layout-specific optimizations. A global memory barrier *cannot* transition an image layout. If you need to change an image from `eColorAttachmentOptimal` to `eShaderReadOnlyOptimal`, you *must* use an image memory barrier. 
+ +== The Golden Rule: Batching + +Whether you choose global or local barriers, the most important rule for Vulkan synchronization performance is **Batching**. + +Avoid calling `pipelineBarrier2` multiple times in a row. Every call to `pipelineBarrier2` has a non-trivial overhead. Instead, collect all your barriers (global, image, and buffer) into a single `vk::DependencyInfo` and submit them in one go. + +[,cpp] +---- +std::vector<vk::ImageMemoryBarrier2> imageBarriers = { /* ... */ }; +vk::MemoryBarrier2 globalBarrier = { /* ... */ }; + +auto dependencyInfo = vk::DependencyInfo{ + .memoryBarrierCount = 1, + .pMemoryBarriers = &globalBarrier, + .imageMemoryBarrierCount = static_cast<uint32_t>(imageBarriers.size()), + .pImageMemoryBarriers = imageBarriers.data() +}; + +commandBuffer.pipelineBarrier2(dependencyInfo); +---- + +By batching your barriers, you give the driver the opportunity to consolidate the cache flushes and stage stalls, ensuring that the GPU spends as little time as possible waiting and as much time as possible rendering. + +== Simple Engine: Optimization + +In `Simple Engine`, we primarily use **Image Memory Barriers** because most of our synchronization involves layout transitions (e.g., from `eColorAttachmentOptimal` to `eShaderReadOnlyOptimal`). However, we do use **Global Memory Barriers** in our `ComputeSystem` (e.g., in `physics_system.cpp`) when we need to ensure that all previous compute writes to any and all storage buffers are visible to subsequent shader stages. + +One area where `Simple Engine` could be further optimized is in the consolidation of these barriers. Currently, some of our systems emit their own barriers independently. In a future update, we plan to move toward a **Render Graph** architecture. This would allow the engine to collect all necessary barriers across all systems for an entire frame and batch them into a single, highly-optimized `vkCmdPipelineBarrier2` call, further reducing driver overhead and improving GPU occupancy. 
+ +== Navigation + +Previous: xref:Synchronization/Pipeline_Barriers_Transitions/03_queue_family_ownership.adoc[Queue Family Ownership] | Next: xref:Synchronization/Timeline_Semaphores/01_introduction.adoc[Timeline Semaphores: The Master Clock] diff --git a/en/Synchronization/Profiling_Optimization/01_introduction.adoc b/en/Synchronization/Profiling_Optimization/01_introduction.adoc new file mode 100644 index 00000000..760b8abf --- /dev/null +++ b/en/Synchronization/Profiling_Optimization/01_introduction.adoc @@ -0,0 +1,30 @@ +:pp: {plus}{plus} += Profiling, Batching, and Optimization: Squeezing the GPU + +== Introduction + +Congratulations! You've mastered the core mechanics of **Synchronization 2**, the monotonic world of **Timeline Semaphores**, and the complexities of **Asynchronous Compute** and **Asset Streaming**. You've built a renderer that is robust, modern, and validated. + +But in the world of high-performance graphics, "correct" is only the beginning. The final challenge is to make your synchronization as efficient as possible. Every barrier you record and every semaphore you signal has a cost—both in terms of driver overhead and potential hardware stalls. + +In this final chapter, we're going to move beyond the "how" and "why" of synchronization and focus on the "how fast." We'll explore the advanced techniques that professional engine developers use to squeeze every last drop of performance out of the GPU. + +== The Optimization Mindset + +Optimization in synchronization is a balancing act. On one hand, you want to be as specific as possible to avoid unnecessary stalls. On the other hand, you want to minimize the number of times you call into the driver. + +The key is to think in terms of **Batching** and **Visibility**. Instead of thinking about each resource in isolation, you should think about your frame as a whole. Where can you group dependencies? Where can you move barriers to allow more work to overlap? 
Where can you use hardware profiling tools to find the "invisible" bottlenecks that are holding your frame rate back? + +== What We'll Explore + +In this final chapter, we'll dive into the advanced world of Vulkan optimization. We'll explore: + +1. **Barrier Batching**: How to consolidate multiple global, image, and buffer barriers into a single `vkCmdPipelineBarrier2` call to reduce driver overhead. +2. **Visualizing Stalls**: We'll revisit the "bubble" problem from Chapter 6, but this time with a focus on using hardware profilers to identify and eliminate them at scale. +3. **Final Refinements**: We'll wrap up the series with a checklist of best practices and common pitfalls to ensure your engine remains high-performance as it grows. + +By the end of this chapter, you'll have the knowledge and the tools to take your renderer from "validated" to "optimized," ensuring that your synchronization code is as fast as it is correct. + +== Navigation + +Previous: xref:Synchronization/Synchronization_Validation/03_interpreting_vuids.adoc[Interpreting VUIDs] | Next: xref:Synchronization/Profiling_Optimization/02_barrier_batching.adoc[Barrier Batching] diff --git a/en/Synchronization/Profiling_Optimization/02_barrier_batching.adoc b/en/Synchronization/Profiling_Optimization/02_barrier_batching.adoc new file mode 100644 index 00000000..29116196 --- /dev/null +++ b/en/Synchronization/Profiling_Optimization/02_barrier_batching.adoc @@ -0,0 +1,52 @@ +:pp: {plus}{plus} += Barrier Batching: Consolidating Your Synchronization + +== The Cost of a Call + +Every time you call `commandBuffer.pipelineBarrier2`, you are making a trip from your CPU code into the Vulkan driver. The driver then has to parse your `vk::DependencyInfo`, validate your stage and access masks, and then record the actual hardware instructions into the command buffer. + +If you have ten different images to transition, and you record ten individual barriers, you are performing ten driver trips. 
This overhead can add up, especially in a complex frame with many passes. + +== The Solution: Batching + +**Barrier Batching** is the practice of collecting all your global, image, and buffer barriers and submitting them in a single `pipelineBarrier2` call. This is one of the easiest ways to reduce the CPU overhead of your synchronization code. + +The `vk::DependencyInfo` structure is specifically designed for this. It allows you to provide an array of barriers of each type. + +[,cpp] +---- +std::vector<vk::ImageMemoryBarrier2> imageBarriers = { /* ... multiple image transitions ... */ }; +vk::MemoryBarrier2 globalBarrier = { /* ... a broad memory dependency ... */ }; + +auto dependencyInfo = vk::DependencyInfo{ + .memoryBarrierCount = 1, + .pMemoryBarriers = &globalBarrier, + .imageMemoryBarrierCount = static_cast<uint32_t>(imageBarriers.size()), + .pImageMemoryBarriers = imageBarriers.data() +}; + +// One call into the driver instead of many +commandBuffer.pipelineBarrier2(dependencyInfo); +---- + +== Hardware Benefits + +Batching is not just about reducing CPU overhead; it also provides significant benefits on the GPU. When you provide multiple barriers in a single call, the driver can consolidate the cache flushes and the pipeline stalls. + +Instead of stalling the pipeline and flushing caches five different times, the hardware can potentially do it all at once. This reduces the total time the GPU spends waiting and increases the time it spends rendering. + +== Implementation Strategy + +A good strategy for an engine is to have a "Barrier Manager" that collects barriers throughout a pass. When you reach a synchronization point—for example, at the end of a G-Buffer pass—the manager flushes all the collected barriers in a single batch. + +By thinking in terms of batches rather than individual barriers, you move toward a more "holistic" approach to synchronization, ensuring that your engine remains high-performance as you add more complexity to your renderer.
In the next section, we'll see how to use profiling tools to visualize the impact of these optimizations. + +== Simple Engine: Consolidation + +In `Simple Engine`, we apply this principle of barrier batching in our `Renderer::Render` loop. For example, during the **Opaque Pass** to **Post-Processing** transition, we collect all necessary image barriers—including those for the scene color and the depth buffer—into a single `vk::DependencyInfo`. + +One optimization we plan for a future version of `Simple Engine` is to centralize this further. By implementing a "Barrier Manager" that collects barriers across all systems (Renderer, Physics, Audio), we can reduce our total number of `pipelineBarrier2` calls per frame. This is a critical part of our roadmap toward a full **Render Graph** system, where all synchronization is calculated globally for each frame, ensuring that we never emit redundant barriers and that all transitions are batched for maximum hardware performance. + +== Navigation + +Previous: xref:Synchronization/Profiling_Optimization/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Profiling_Optimization/03_visualizing_stalls.adoc[Visualizing Stalls] diff --git a/en/Synchronization/Profiling_Optimization/03_visualizing_stalls.adoc b/en/Synchronization/Profiling_Optimization/03_visualizing_stalls.adoc new file mode 100644 index 00000000..9430f976 --- /dev/null +++ b/en/Synchronization/Profiling_Optimization/03_visualizing_stalls.adoc @@ -0,0 +1,35 @@ +:pp: {plus}{plus} += Visualizing Stalls: Finding Your Pipeline Bubbles + +== Hardware Profilers + +The most effective way to optimize your synchronization code is to see what the GPU is actually doing. We use hardware profilers like **NVIDIA Nsight Graphics** or **AMD Radeon GPU Profiler** to visualize the pipeline. + +In these tools, you can see a "Timeline" view that shows exactly when each part of the GPU (graphics cores, compute cores, transfer engine) is busy. 
A **Pipeline Bubble** is a gap in this timeline—a period where the hardware is idle because it's waiting for a dependency that hasn't been reached. + +== Identifying the Cause + +When you find a bubble, you must determine its cause. Is it a real dependency (e.g., the lighting pass waiting for the G-Buffer to finish)? Or is it an artificial stall caused by a too-conservative barrier? + +A common mistake is using `vk::PipelineStageFlagBits2::eAllCommands` for every barrier. This tells the GPU: "Stop everything until all previous commands have finished." This is a massive "sledgehammer" that can create huge bubbles. Instead, you should always use the most specific stage mask possible (e.g., `eColorAttachmentOutput`). + +== Practical Refinement + +To refine your masks, follow this process: + +1. **Spot the Bubble**: Find a gap in the timeline in your profiler. +2. **Identify the Dependency**: Look at the barrier that precedes the gap. +3. **Refine the Stage Mask**: Check if the dependency can be satisfied by an earlier stage. For example, can your shadow pass start as soon as the vertex work of the previous frame is done? +4. **Verify the Fix**: Re-run the profiler and check if the bubble has shrunk or disappeared. + +== Closing the Series + +Congratulations! You've successfully navigated the complex and powerful world of **Synchronization 2**, **Timeline Semaphores**, and **Asynchronous Overlap**. You've built a renderer that is modern, validated, and optimized. + +Synchronization is one of the most challenging parts of Vulkan, but it's also where you have the most power to differentiate your engine's performance. By applying the principles we've learned in this series—using the most specific stage masks, batching your barriers, and visualizing your stalls—you can build a professional-grade renderer that squeezes every last drop of performance out of the hardware. + +Keep profiling, keep refining, and keep building! 
+ +== Navigation + +Previous: xref:Synchronization/Profiling_Optimization/02_barrier_batching.adoc[Barrier Batching] | Next: xref:Synchronization/introduction.adoc[Back to Introduction] diff --git a/en/Synchronization/Synchronization_Validation/01_introduction.adoc b/en/Synchronization/Synchronization_Validation/01_introduction.adoc new file mode 100644 index 00000000..6960783d --- /dev/null +++ b/en/Synchronization/Synchronization_Validation/01_introduction.adoc @@ -0,0 +1,30 @@ +:pp: {plus}{plus} += Debugging with Synchronization Validation: Finding Your Hazards + +== Introduction + +Vulkan synchronization is a "trust but verify" system. You can write what you believe is perfectly correct `vk::DependencyInfo` and `vk::SubmitInfo2` code, but the only way to be absolutely certain is to test it against the actual hardware behavior. However, synchronization bugs are notoriously difficult to find. They often manifest as subtle flickering, occasional crashes, or—worst of all—perfect behavior on your development machine and complete failure on a customer's GPU. + +This is where the **LunarG Synchronization Validation** layer comes in. It is, without a doubt, the most important tool in your Vulkan debugging arsenal. Unlike the standard validation layers that check for API usage errors, the sync validation layer tracks the state of every resource in your engine and identifies the "Read-After-Write" (RAW), "Write-After-Read" (WAR), and "Write-After-Write" (WAW) hazards that lead to data corruption. + +== The Hazards We Face + +Synchronization is essentially about managing these three types of hazards: + +1. **Read-After-Write (RAW)**: A stage tries to read a resource before a previous stage has finished writing to it. This is the most common cause of "garbage" data. +2. **Write-After-Read (WAR)**: A stage tries to write to a resource while a previous stage is still reading from it. This can lead to the previous stage reading "half-updated" data. +3. 
**Write-After-Write (WAW)**: Two stages try to write to the same resource simultaneously. The result is unpredictable and almost always leads to corruption. + +== What We'll Explore + +In this chapter, we'll learn how to leverage the validation layers to make our engine perfectly robust. We'll explore: + +1. **The Validation Layer**: How to configure and enable the LunarG Synchronization Validation layer within your engine's debug build. +2. **Interpreting VUIDs**: Vulkan Validation Unique Identifiers (VUIDs) can be daunting. We'll learn how to decipher these complex error messages and turn them into actionable code fixes. +3. **Identifying Hazards**: We'll see real-world examples of how the validation layer catches hazards that are nearly impossible to find through manual inspection. + +By the end of this chapter, you'll have the tools and the knowledge to ensure that your synchronization code is not just "mostly correct," but "Vulkan-validated" correct. + +== Navigation + +Previous: xref:Synchronization/Host_Image_Copies_Memory_Sync/03_visibility_flushes.adoc[Visibility & Flushes] | Next: xref:Synchronization/Synchronization_Validation/02_validation_layer.adoc[The Validation Layer] diff --git a/en/Synchronization/Synchronization_Validation/02_validation_layer.adoc b/en/Synchronization/Synchronization_Validation/02_validation_layer.adoc new file mode 100644 index 00000000..f7d5fc04 --- /dev/null +++ b/en/Synchronization/Synchronization_Validation/02_validation_layer.adoc @@ -0,0 +1,58 @@ +:pp: {plus}{plus} += The Validation Layer: Configuring Your Environment + +== Enabling Sync Validation + +The **LunarG Synchronization Validation** layer ships as part of the standard `VK_LAYER_KHRONOS_validation` layer, but it is not enabled by default. In many environments, you must explicitly enable it through the `vk_layer_settings.txt` file or through your engine's `vk::InstanceCreateInfo`.
+ +[TIP] +==== +A much easier way to manage this during development is using **vkconfig** (LunarG Vulkan Configurator). This GUI tool, included with the Vulkan SDK, allows you to globally enable or disable synchronization validation with a single click, without modifying your source code or project configuration. +==== + +To enable it via the instance, you add the `VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT` flag to the `vk::ValidationFeaturesEXT` structure and pass it as the `pNext` of your `vk::InstanceCreateInfo`. + +[,cpp] +---- +auto syncValidationFeature = vk::ValidationFeaturesEXT{ + .enabledValidationFeatureCount = 1, + .pEnabledValidationFeatures = &vk::ValidationFeatureEnableEXT::eSynchronizationValidation +}; + +auto instanceCreateInfo = vk::InstanceCreateInfo{ + .pNext = &syncValidationFeature, + // ... +}; + +auto instance = vk::raii::Instance(context, instanceCreateInfo); +---- + +== Working with vk_layer_settings.txt + +For a more flexible approach, you can create a `vk_layer_settings.txt` file in your application's working directory. This file allows you to configure many aspects of the validation layers without recompiling your code. + +---- +# Example vk_layer_settings.txt +khronos_validation.enables = VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT +khronos_validation.report_flags = error,warn,perf,info +---- + +== What the Layer Actually Does + +Once enabled, the sync validation layer begins tracking every resource in your engine. It keeps a record of the last stage that wrote to a resource, the stage that is currently reading from it, and the stage that is next in line. + +If it detects a situation where two stages could be accessing the same memory without a proper barrier—for example, if a fragment shader starts reading a texture before its transfer copy has finished—the layer will emit a validation error. + +It’s important to note that the sync validation layer has a **non-trivial performance overhead**.
It is not meant to be left on in your production or release builds. It should be used exclusively during development and testing to catch hazards before they become bugs. + +== Simple Engine: Development Workflow + +In `Simple Engine`, we integrate Synchronization Validation directly into our debug builds. When you run the engine with the `--debug-sync` command-line flag (or enable it in `renderer_core.cpp`), we automatically add `vk::ValidationFeatureEnableEXT::eSynchronizationValidation` to our instance creation. + +This is a critical part of our development workflow. Whenever we add a new rendering pass—like our recent **Forward+ Lighting** or **Ray Query Shadows**—we run the engine with synchronization validation enabled. This allows us to catch any "Write-After-Read" (WAR) or "Read-After-Write" (RAW) hazards early, before they manifest as flickering pixels or intermittent GPU hangs. By letting the tools find these hazards for us, we can spend more time optimizing our engine and less time chasing down elusive synchronization bugs. + +In the next section, we'll see how to decipher the error messages this layer produces. + +== Navigation + +Previous: xref:Synchronization/Synchronization_Validation/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Synchronization_Validation/03_interpreting_vuids.adoc[Interpreting VUIDs] diff --git a/en/Synchronization/Synchronization_Validation/03_interpreting_vuids.adoc b/en/Synchronization/Synchronization_Validation/03_interpreting_vuids.adoc new file mode 100644 index 00000000..397ec462 --- /dev/null +++ b/en/Synchronization/Synchronization_Validation/03_interpreting_vuids.adoc @@ -0,0 +1,40 @@ +:pp: {plus}{plus} += Interpreting VUIDs: Deciphering Your Hazard Errors + +== The Anatomy of a VUID + +Vulkan **Validation Unique Identifiers (VUIDs)** are the specific error codes that the validation layers emit when they find a problem. 
These IDs, like `VUID-VkImageMemoryBarrier2-image-01199`, are not just random numbers. They correspond to specific rules in the Vulkan specification. + +When the sync validation layer finds a hazard, it will emit an error message that looks something like this: + +---- +VALIDATION [SYNC-HAZARD-READ-AFTER-WRITE] (0x01234567) +VUID: VUID-vkCmdDraw-None-07892 +Message: Read-After-Write (RAW) hazard on Image (0x89abcdef) in VkCommandBuffer (0x12345678). + - Current Stage: VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT + - Current Access: VK_ACCESS_2_SHADER_READ_BIT + - Previous Stage: VK_PIPELINE_STAGE_2_COPY_BIT + - Previous Access: VK_ACCESS_2_TRANSFER_WRITE_BIT +---- + +== Deciphering the Message + +To a new developer, this message can be overwhelming. But if you break it down, it’s actually telling you exactly what’s wrong: + +1. **Hazard Type**: The `[SYNC-HAZARD-READ-AFTER-WRITE]` tag tells you the nature of the problem. In this case, a read is happening before a previous write has finished. +2. **Resource**: The message identifies the specific resource (`Image (0x89abcdef)`) and the command buffer where the hazard occurred. +3. **The Culprits**: The message lists the "Current" and "Previous" stages and access masks. In this example, the fragment shader is trying to read an image that was just being updated by a copy operation. + +== Actionable Fixes + +Once you understand what the message is telling you, the fix is usually straightforward: + +- **Add a Barrier**: If a previous stage is still writing when the current stage starts reading, you need to add a `vk::ImageMemoryBarrier2` (or a `vk::MemoryBarrier2`) between the two stages to ensure that the write is finished and visible. +- **Refine Your Stages**: If you already have a barrier, check that your `srcStageMask` and `dstStageMask` are correct. Did you wait for the correct stage? Did you use the correct access mask?
+- **Check Your Submission**: If the hazard occurs between two different submissions, are you using a semaphore or a fence to coordinate them? + +By treating every VUID as a learning opportunity, you can systematically improve the quality and the performance of your synchronization code. In the final chapter, we'll see how to optimize these patterns for maximum GPU throughput. + +== Navigation + +Previous: xref:Synchronization/Synchronization_Validation/02_validation_layer.adoc[The Validation Layer] | Next: xref:Synchronization/Profiling_Optimization/01_introduction.adoc[Profiling, Batching, and Optimization] diff --git a/en/Synchronization/Timeline_Semaphores/01_introduction.adoc b/en/Synchronization/Timeline_Semaphores/01_introduction.adoc new file mode 100644 index 00000000..bee01d84 --- /dev/null +++ b/en/Synchronization/Timeline_Semaphores/01_introduction.adoc @@ -0,0 +1,33 @@ +:pp: {plus}{plus} += Timeline Semaphores: The Master Clock + +== Introduction + +For years, Vulkan developers have had to juggle two very different synchronization primitives: **Binary Semaphores** and **Fences**. Binary semaphores were used exclusively for GPU-to-GPU synchronization (e.g., waiting for an image to be ready before sampling it), while Fences were used for GPU-to-CPU synchronization (e.g., waiting for a command buffer to finish before reusing its resources). + +This split forced us to write two different sets of logic for what is essentially the same problem: "Is the work done yet?" It also led to complex "semaphore chains" that were notoriously difficult to debug. + +**Timeline Semaphores**, introduced as an extension and now a core part of Vulkan 1.2+, change everything. They provide a single, unified primitive that can handle both GPU-to-GPU and GPU-to-CPU synchronization using a simple, monotonic `uint64_t` counter. + +== The Monotonic World + +In a timeline-based system, progress is measured by a value that only ever increases. 
When you submit a piece of work to the GPU, you tell it: "When you finish this, set the semaphore value to 10." If another piece of work needs that data, you tell it: "Don't start until the semaphore value is at least 10." + +This simple change has profound implications for how we architect our engines: + +1. **Unified Logic**: We no longer care if the "waiter" is the CPU or the GPU. The interface is the same: we wait for a specific value. +2. **Wait-Before-Signal**: One of the most powerful features of Timeline Semaphores is that you can submit work to the GPU that waits for a value that hasn't even been reached yet. This allows us to decouple our submission logic from our execution logic. +3. **Better Debugging**: Because the value is a simple integer, we can easily log it, inspect it in a debugger, or even use it to build a visual profiler of our engine's progress. + +[NOTE] +==== +One current limitation to keep in mind: **Window System Integration (WSI)**. As of the current Vulkan specification, swapchain acquire and present operations still require legacy binary semaphores and cannot yet wait on or signal timeline semaphores directly. +==== + +In this chapter, we are going to explore how to implement Timeline Semaphores as the "master clock" of our renderer. We'll start by looking at how to replace our legacy fences and binary semaphores, then we'll dive into the implementation of the monotonic counter and the highly efficient wait-before-signal submission pattern. + +Let's begin by unifying our synchronization primitives. + +== Navigation + +Previous: xref:Synchronization/Pipeline_Barriers_Transitions/04_global_vs_local_barriers.adoc[Global vs. 
Local Barriers] | Next: xref:Synchronization/Timeline_Semaphores/02_unifying_sync.adoc[Unifying Synchronization] diff --git a/en/Synchronization/Timeline_Semaphores/02_unifying_sync.adoc b/en/Synchronization/Timeline_Semaphores/02_unifying_sync.adoc new file mode 100644 index 00000000..4988f445 --- /dev/null +++ b/en/Synchronization/Timeline_Semaphores/02_unifying_sync.adoc @@ -0,0 +1,73 @@ +:pp: {plus}{plus} += Unifying Synchronization: Replacing Fences and Binary Semaphores + +== The Simplification + +The most immediate benefit of moving to Timeline Semaphores is that you can effectively delete your code for handling fences and binary semaphores. Instead of maintaining separate sets of primitives, you create a single `vk::raii::Semaphore` and configure it to be a **Timeline** type. + +In the RAII context, this configuration happens through the `vk::SemaphoreTypeCreateInfo` which is passed as the `pNext` of the standard `vk::SemaphoreCreateInfo`. + +[,cpp] +---- +auto typeCreateInfo = vk::SemaphoreTypeCreateInfo{ + .semaphoreType = vk::SemaphoreType::eTimeline, + .initialValue = 0 +}; + +auto createInfo = vk::SemaphoreCreateInfo{ + .pNext = &typeCreateInfo +}; + +auto timelineSemaphore = vk::raii::Semaphore(device, createInfo); +---- + +== Handling CPU Waits + +Wait operations on the CPU, which used to require a `vk::Fence`, now use the `vk::Device::waitSemaphores` function. This function can wait for multiple semaphores simultaneously and will return as soon as all specified values have been reached. + +[,cpp] +---- +auto waitInfo = vk::SemaphoreWaitInfo{ + .semaphoreCount = 1, + .pSemaphores = &(*timelineSemaphore), + .pValues = &targetValue +}; + +// Wait for the GPU to reach targetValue (equivalent to vkWaitForFences) +auto result = device.waitSemaphores(waitInfo, timeoutInNanoseconds); +---- + +The beauty here is that we can now query the current value of the semaphore at any time using `device.getSemaphoreCounterValue`. 
This allows for much more flexible engine logic than the binary "is it done yet?" state of a fence. + +== Handling GPU Waits + +GPU-to-GPU synchronization, which used to require binary semaphores, now happens within the `vk::SubmitInfo2` (part of Synchronization 2). You specify the timeline semaphore and the specific value that the queue must wait for before beginning execution. + +[,cpp] +---- +auto waitSemaphoreInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *timelineSemaphore, + .value = requiredValue, + .stageMask = vk::PipelineStageFlagBits2::eAllCommands +}; + +auto submitInfo = vk::SubmitInfo2{ + .waitSemaphoreInfoCount = 1, + .pWaitSemaphoreInfos = &waitSemaphoreInfo, + // ... +}; + +queue.submit2(submitInfo); +---- + +By using the same primitive for both, we eliminate the need to synchronize between fences and semaphores. The GPU signals the timeline, and both the CPU and other GPU queues can respond to that same signal by waiting for the appropriate value. + +== Simple Engine: The Roadmap to Timeline + +Currently, `Simple Engine` uses the legacy combination of `inFlightFences` (for GPU-to-CPU sync) and `imageAvailableSemaphores` / `renderFinishedSemaphores` (for GPU-to-GPU sync). This requires us to carefully manage `MAX_FRAMES_IN_FLIGHT` sets of each primitive, leading to the "ping-pong" logic you've likely seen in `Renderer::Render`. + +Our next major architectural update will replace these with a single `Renderer::frameTimeline` semaphore. This will allow us to unify our wait logic. Instead of `device.waitForFences`, we will use `device.waitSemaphores` to wait for the specific frame index value. This significantly simplifies our `Renderer::Render` function and makes the frame loop much easier to reason about, especially as we introduce more complex asynchronous tasks.
+ +== Navigation + +Previous: xref:Synchronization/Timeline_Semaphores/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Timeline_Semaphores/03_monotonic_counter.adoc[The Monotonic Counter] diff --git a/en/Synchronization/Timeline_Semaphores/03_monotonic_counter.adoc b/en/Synchronization/Timeline_Semaphores/03_monotonic_counter.adoc new file mode 100644 index 00000000..0a09e673 --- /dev/null +++ b/en/Synchronization/Timeline_Semaphores/03_monotonic_counter.adoc @@ -0,0 +1,50 @@ +:pp: {plus}{plus} += The Monotonic Counter: Tracking Global Progress + +== Understanding the Counter + +At the heart of every timeline semaphore is a single `uint64_t` value. This value is monotonic, meaning it can only ever increase. This simple property allows us to treat the entire execution of our GPU/CPU engine as a single, unified timeline. + +When you submit a command buffer to a queue, you associate it with a signal operation on a timeline semaphore. You assign a specific value to that signal—say, `frame_index * 10 + pass_index`. As the GPU completes each pass, the semaphore value increments. + +== Tracking Progress + +Because we can query this value from the CPU at any time using `device.getSemaphoreCounterValue`, we can build much more intelligent engine logic. For example, instead of waiting for a "Render Complete" fence, we can query the timeline and see exactly which stage the GPU is currently working on. + +[,cpp] +---- +uint64_t currentValue = device.getSemaphoreCounterValue(*timelineSemaphore); +if (currentValue >= PassValues::eShadowPassComplete) { + // We can start preparing the next pass that depends on shadows +} +---- + +This is particularly useful for asynchronous resource management. You can tag resources with the timeline value at which they were last used. When you need to reuse or destroy a resource, you simply check if the current semaphore value has exceeded that tag. 
This eliminates the need for conservative `deviceWaitIdle()` calls, which are often the primary cause of GPU bubbles and CPU stalls. + +== Strategic Value Selection + +Choosing how to increment your timeline values is an architectural decision. A common pattern is to use a large increment for each frame (e.g., 1000) and then use small sub-increments for each major pass within that frame. + +* Frame 1: + * Start: 1000 + * Shadow Pass: 1010 + * G-Buffer Pass: 1020 + * Lighting Pass: 1030 +* Frame 2: + * Start: 2000 + * Shadow Pass: 2010 + * ... + +This numbering scheme provides plenty of "headroom" for adding new passes or sub-steps without having to re-calculate every single synchronization value in your engine. It also makes your logs much easier to read, as the frame number is clearly encoded in the timeline value. + +By treating the timeline as a "master clock," you move away from micro-managing individual dependencies and toward managing the overall state and progress of your renderer. In the next section, we'll see how this enables one of the most powerful submission patterns in Vulkan: the wait-before-signal. + +== Simple Engine: Tracking Frame Progress + +In `Simple Engine`, we will use the monotonic counter to track the progress of each system. We'll define a set of `TimelineValues` that represent major milestones in our frame. For example, our `Renderer` could signal a value like `currentFrameIndex * 10 + passOffset` to indicate that a specific rendering stage has finished. + +This becomes incredibly powerful when paired with our `MemoryPool`. Instead of using a simple "frames since destroy" counter (like we currently do in `pendingASDeletions`), we can tag each resource with the exact `TimelineValue` at which it was last used by the GPU. When the `MemoryPool` needs to reclaim memory, it can simply query the current semaphore value. 
If `currentValue >= resourceTagValue`, the resource is guaranteed to be safe for destruction or reuse, with no extra stalls or conservative waits required. + +== Navigation + +Previous: xref:Synchronization/Timeline_Semaphores/02_unifying_sync.adoc[Unifying Synchronization] | Next: xref:Synchronization/Timeline_Semaphores/04_wait_before_signal.adoc[Wait-Before-Signal Submission] diff --git a/en/Synchronization/Timeline_Semaphores/04_wait_before_signal.adoc b/en/Synchronization/Timeline_Semaphores/04_wait_before_signal.adoc new file mode 100644 index 00000000..6cbf22a0 --- /dev/null +++ b/en/Synchronization/Timeline_Semaphores/04_wait_before_signal.adoc @@ -0,0 +1,69 @@ +:pp: {plus}{plus} += Wait-Before-Signal Submission: Decoupling Execution + +== The Paradigm Shift + +In legacy Vulkan, you generally had to submit work in the order it was intended to execute. If Command Buffer B depended on Command Buffer A, you either had to submit A first, or submit them both in the same `vkQueueSubmit` call with a binary semaphore connecting them. + +Timeline Semaphores introduce the **Wait-Before-Signal** submission pattern. This allows you to submit Command Buffer B to the GPU *before* Command Buffer A has even been recorded, let alone submitted. You simply tell Command Buffer B to wait for a specific value on a timeline semaphore. As long as Command Buffer A (or some other process) eventually signals that value, the GPU will correctly manage the dependency. + +== How It Works + +This pattern works because Vulkan separates the **submission** of work from the **execution** of work. When you call `queue.submit2`, the driver simply adds your commands to the queue's internal buffer. The hardware's command processor then monitors the specified timeline semaphores. It will not begin executing the commands until all the "wait" values have been reached. + +[,cpp] +---- +// Submit the "Waiter" first! 
+auto waitInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *timelineSemaphore, + .value = 10, + .stageMask = vk::PipelineStageFlagBits2::eAllCommands +}; + +auto submitWaiter = vk::SubmitInfo2{ + .waitSemaphoreInfoCount = 1, + .pWaitSemaphoreInfos = &waitInfo, + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &waitCommandBufferInfo +}; + +graphicsQueue.submit2(submitWaiter); + +// ... Later, perhaps in a different thread or even a different frame ... + +// Submit the "Signaler" +auto signalInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *timelineSemaphore, + .value = 10, + .stageMask = vk::PipelineStageFlagBits2::eAllCommands +}; + +auto submitSignaler = vk::SubmitInfo2{ + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &signalCommandBufferInfo, + .signalSemaphoreInfoCount = 1, + .pSignalSemaphoreInfos = &signalInfo +}; + +transferQueue.submit2(submitSignaler); +---- + +== Why This Matters + +This decoupling is a game-changer for modern, multi-threaded engine architectures. + +1. **Reduced CPU Latency**: Your main thread can submit all its work to the GPU as soon as the command buffers are recorded, without waiting for background threads (like an asset loader or a physics engine) to finish their work. +2. **Asynchronous Overlap**: It makes it much easier to implement overlapping passes. For example, your GPU can start its geometry pass while the CPU is still finishing the recording of the post-processing pass, as long as the post-processing pass waits for the geometry timeline value. +3. **Simplified Architecture**: You can build your submission logic around the "needs" of each pass, rather than worrying about the strict ordering of API calls. + +Wait-before-signal is the final piece of the puzzle for a truly modern Vulkan renderer. 
By combining the precision of Synchronization 2 with the flexibility of Timeline Semaphores, you can build an engine that is both easier to reason about and capable of squeezing every last drop of performance out of the hardware. + +== Simple Engine: Non-Blocking Submission + +In `Simple Engine`, we will use this pattern to decouple our `PhysicsSystem` from our `Renderer`. Currently, the renderer must wait for the physics simulation to finish on the CPU before it can even *record* its command buffers. This creates a massive CPU stall every frame. + +With wait-before-signal, our `Renderer` will simply record its commands to wait for the `physicsTimeline` to reach a specific value (e.g., `currentFrameIndex`). It can then submit those commands immediately to the `graphicsQueue`. Even if the `PhysicsSystem` hasn't finished its simulation on the `computeQueue` yet, the GPU will correctly wait at the beginning of the frame's rendering. This allows the CPU to move on to other tasks (like audio processing or input handling) while the GPU is efficiently managing the dependency itself. + +== Navigation + +Previous: xref:Synchronization/Timeline_Semaphores/03_monotonic_counter.adoc[The Monotonic Counter] | Next: xref:Synchronization/Frame_in_Flight/01_introduction.adoc[Frame-in-Flight Architecture] diff --git a/en/Synchronization/Transfer_Queues_Streaming/01_introduction.adoc b/en/Synchronization/Transfer_Queues_Streaming/01_introduction.adoc new file mode 100644 index 00000000..86a6fa6b --- /dev/null +++ b/en/Synchronization/Transfer_Queues_Streaming/01_introduction.adoc @@ -0,0 +1,32 @@ +:pp: {plus}{plus} += Transfer Queues & Asset Streaming Sync: Non-Blocking Uploads + +== Introduction + +In a modern, open-world game or a complex architectural visualization, we can't afford to load all our assets upfront. We need to stream textures, meshes, and animation data in the background as the player moves through the world. 
If we do this on the main graphics queue, we risk introducing "stutters" (dropped frames) every time we submit a large upload. + +The solution is to use a **Dedicated Transfer Queue**. Most modern GPUs have a specialized engine designed specifically for moving data from CPU-visible staging buffers to GPU-optimal memory. This engine can run completely independently of the graphics and compute units, allowing us to stream gigabytes of data without affecting the frame rate. + +== The Staging Pipeline + +Asset streaming is a multi-step process. First, the CPU maps a **Staging Buffer** and writes the raw data (like decoded image pixels or mesh data). Then, the transfer queue is used to copy that data into a **GPU-Optimal Buffer or Image**. Finally, the graphics queue is notified that the data is ready so it can begin using it in a shader. + +The challenge, as always, is synchronization. We must ensure that: + +1. **CPU to Transfer**: The transfer queue doesn't start copying until the CPU has finished writing to the staging buffer. +2. **Transfer to GPU**: The transfer operation is complete and the data is visible in GPU memory. +3. **Transfer to Graphics**: The graphics queue doesn't try to sample the texture until the transfer queue has finished its work and, if necessary, released ownership of the resource. + +== What We'll Build + +In this chapter, we will implement a robust, non-blocking asset streaming system. We'll explore: + +1. **Non-Blocking Data Uploads**: How to utilize a dedicated transfer queue for background texture and buffer streaming. +2. **Staging Synchronization**: Coordinating **Timeline Semaphores** to ensure the graphics queue waits for the transfer to complete before sampling new data. +3. **Ownership Handshakes**: Implementing the queue family ownership transfers we learned about in Chapter 2 (Pipeline Barriers and Transitions), but in the context of a background streaming system. 
+ +By the end of this chapter, you'll have a streaming architecture that allows your engine to load massive amounts of data in the background while maintaining a perfectly smooth, stutter-free frame rate. + +== Navigation + +Previous: xref:Synchronization/Async_Compute_Overlap/04_bubble_problem.adoc[The Bubble Problem] | Next: xref:Synchronization/Transfer_Queues_Streaming/02_non_blocking_uploads.adoc[Non-Blocking Data Uploads] diff --git a/en/Synchronization/Transfer_Queues_Streaming/02_non_blocking_uploads.adoc b/en/Synchronization/Transfer_Queues_Streaming/02_non_blocking_uploads.adoc new file mode 100644 index 00000000..ef608e55 --- /dev/null +++ b/en/Synchronization/Transfer_Queues_Streaming/02_non_blocking_uploads.adoc @@ -0,0 +1,98 @@ +:pp: {plus}{plus} += Non-Blocking Data Uploads: Utilizing the Dedicated Transfer Queue + +== Why Use a Dedicated Queue? + +In a simple Vulkan application, we might use the same queue for graphics, compute, and transfer work. This is easy to implement, but it's not efficient. Every time we submit a large transfer, the graphics queue has to stop what it's doing and wait for the transfer engine to finish. This creates a "stutter" in our frame rate. + +=== Identifying the Transfer Queue + +To get a truly asynchronous transfer queue, we look for a queue family that supports `vk::QueueFlagBits::eTransfer` but ideally does NOT support `vk::QueueFlagBits::eGraphics` or `vk::QueueFlagBits::eCompute`. This ensures the hardware has a dedicated DMA engine for memory copies that doesn't share resources with the main processing units. 
+ +Here is how we identify these dedicated transfer families: + +[,cpp] +---- +uint32_t transferQueueFamilyIndex = std::numeric_limits<uint32_t>::max(); +auto queueFamilies = physicalDevice.getQueueFamilyProperties(); + +for (uint32_t i = 0; i < queueFamilies.size(); ++i) { + // Look for a family that is dedicated to transfer + if ((queueFamilies[i].queueFlags & vk::QueueFlagBits::eTransfer) && + !(queueFamilies[i].queueFlags & vk::QueueFlagBits::eGraphics) && + !(queueFamilies[i].queueFlags & vk::QueueFlagBits::eCompute)) { + transferQueueFamilyIndex = i; + break; + } +} + +// Fallback: use any family that supports transfer +if (transferQueueFamilyIndex == std::numeric_limits<uint32_t>::max()) { + for (uint32_t i = 0; i < queueFamilies.size(); ++i) { + if (queueFamilies[i].queueFlags & vk::QueueFlagBits::eTransfer) { + transferQueueFamilyIndex = i; + break; + } + } +} +---- + +By using a **Dedicated Transfer Queue**, we can perform these uploads in the background. The transfer engine is a specialized piece of hardware that can move data between memory locations without using the GPU's compute or graphics cores. By offloading these tasks, we can keep our main rendering pipeline running at full speed. + +== Implementing the Transfer + +When we use a dedicated transfer queue, we must be careful with how we record and submit our command buffers. We typically use a specialized **Transfer Command Pool** that is tied to our transfer queue family. 
+ +[,cpp] +---- +// Record a transfer command buffer +auto cmd = std::move(vk::raii::CommandBuffers(device, { .commandPool = *transferPool, .level = vk::CommandBufferLevel::ePrimary, .commandBufferCount = 1 }).front()); +cmd.begin({ .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit }); + +// Copy from staging buffer to GPU-optimal image +auto region = vk::BufferImageCopy{ + .bufferOffset = 0, + .imageSubresource = { .aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1 }, + .imageExtent = extent +}; +cmd.copyBufferToImage(*stagingBuffer, *gpuImage, vk::ImageLayout::eTransferDstOptimal, region); + +cmd.end(); +---- + +== Submitting for Parallel Execution + +The key to non-blocking uploads is submitting our transfer work to the transfer queue *independently* of our main graphics loop. We don't want our CPU to wait for the transfer to finish. Instead, we use a **Timeline Semaphore** to signal when the transfer is complete. + +[,cpp] +---- +// On the background thread +auto signalInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *transferTimeline, + .value = nextTransferValue++, + .stageMask = vk::PipelineStageFlagBits2::eAllTransfer +}; + +auto submit = vk::SubmitInfo2{ + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdInfo, + .signalSemaphoreInfoCount = 1, + .pSignalSemaphoreInfos = &signalInfo +}; + +transferQueue.submit2(submit); +---- + +Because we are using a dedicated queue, the GPU can process this transfer while it is simultaneously rendering frame N or frame N+1 on its graphics queue. There is no contention for the command processor or the shader units. + +== Simple Engine: The Streaming Thread + +In `Simple Engine`, we have a dedicated `LoadingThread` that handles the background loading and uploading of textures. This thread uses a separate `vk::raii::CommandPool` and a dedicated `transferQueue` (if available on the hardware). 
When a new texture needs to be uploaded, the loading thread records its own transfer commands and submits them to the `transferQueue` independently of the main rendering loop. + +This architecture ensures that our frame rates remain smooth even when loading large new areas of the Bistro scene. The main `Renderer::Render` function is never blocked by the transfer engine. Instead, the renderer only needs to check the status of the `transferTimeline` before it can start using the new texture. This is a much more scalable and responsive approach than the traditional "stop-the-world" loading screen, and it's a key part of how `Simple Engine` achieves high performance on a wide range of hardware. + +In the next section, we'll see how to coordinate the synchronization to ensure that the graphics queue waits for the transfer to finish before trying to sample the newly uploaded data. + +== Navigation + +Previous: xref:Synchronization/Transfer_Queues_Streaming/01_introduction.adoc[Introduction] | Next: xref:Synchronization/Transfer_Queues_Streaming/03_staging_sync.adoc[Staging Synchronization] diff --git a/en/Synchronization/Transfer_Queues_Streaming/03_staging_sync.adoc b/en/Synchronization/Transfer_Queues_Streaming/03_staging_sync.adoc new file mode 100644 index 00000000..dc0103c9 --- /dev/null +++ b/en/Synchronization/Transfer_Queues_Streaming/03_staging_sync.adoc @@ -0,0 +1,72 @@ +:pp: {plus}{plus} += Staging Synchronization: Coordinating Graphics and Transfer + +== The Handshake + +Once your background transfer queue has finished copying a new asset to GPU-optimal memory, you need a way to tell the graphics queue that the data is ready to be sampled. This is where the **Timeline Semaphore** and **Queue Family Ownership Transfer** come together. + +The coordination follows a simple three-step process: + +1. **Transfer Release**: The transfer queue performs its copy and then records a pipeline barrier to "release" ownership of the resource (as we learned in Chapter 2, Pipeline Barriers and Transitions). 
+2. **Semaphore Signal**: The transfer queue signals a specific value on a timeline semaphore when its work is finished. +3. **Graphics Acquire**: The graphics queue waits for that same semaphore value and then records its own pipeline barrier to "acquire" ownership of the resource. + +== Coordinating the Handshake + +The beauty of the **Wait-Before-Signal** pattern we learned in Chapter 3 (Timeline Semaphores) is that the graphics queue can be submitted *before* the transfer has even finished. As long as the graphics queue waits for the correct timeline value, the hardware will ensure the transfer completes first. + +[,cpp] +---- +// Graphics Queue Submission (Wait for Transfer) +auto waitInfo = vk::SemaphoreSubmitInfo{ + .semaphore = *transferTimeline, + .value = upload_complete_value, + .stageMask = vk::PipelineStageFlagBits2::eFragmentShader +}; + +auto acquireBarrier = vk::ImageMemoryBarrier2{ + .srcStageMask = vk::PipelineStageFlagBits2::eFragmentShader, // same stage as the semaphore wait, so the transition is ordered after it + .srcAccessMask = vk::AccessFlagBits2::eNone, + .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .oldLayout = vk::ImageLayout::eTransferDstOptimal, + .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal, + .srcQueueFamilyIndex = transferQueueIndex, + .dstQueueFamilyIndex = graphicsQueueIndex, + .image = newTexture.image(), + .subresourceRange = subresourceRange +}; + +// Record and submit the acquire barrier on the graphics queue +graphicsCommandBuffer.pipelineBarrier2(vk::DependencyInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &acquireBarrier}); + +auto submit = vk::SubmitInfo2{ + .waitSemaphoreInfoCount = 1, + .pWaitSemaphoreInfos = &waitInfo, + // ... +}; + +graphicsQueue.submit2(submit); +---- + +== Handling Resource Lifetimes + +When you stream assets in the background, you must also be careful with the lifetime of your **Staging Buffer**. You cannot reuse or destroy the staging buffer until the transfer queue has finished copying from it. 
This is another area where the timeline semaphore is invaluable. By tagging each staging allocation with its completion value, you can build a simple "garbage collector" that reclaims memory only when it's safe to do so. + +By coordinating your staging synchronization with your timeline semaphores, you can build an engine that is both high-performance and extremely robust. In the next chapter, we'll see how these same principles apply to the modern world of **Dynamic Rendering**, where traditional subpass dependencies have been replaced by these explicit synchronization patterns. + +== Coordinating in Simple Engine + +In `Simple Engine`, this coordination happens in the `Renderer::ProcessPendingMeshUploads` method. When the `LoadingThread` has finished a transfer, it adds the new mesh/texture to a thread-safe queue. The main renderer then checks this queue once per frame. + +For each new asset, the renderer: + +1. **Gets the Timeline Value**: It retrieves the `uploadTimelineValue` that the loading thread signaled for this specific upload. +2. **Records the Acquire Barrier**: It records a `vk::ImageMemoryBarrier2` on the main graphics command buffer. Since we use `eConcurrent` for our images, this barrier primarily handles the layout transition from `eTransferDstOptimal` to `eShaderReadOnlyOptimal` and ensures the data is invalidated in the graphics caches. +3. **Waits for Completion**: It adds a `vk::SemaphoreSubmitInfo` to the main frame submission. This tells the `graphicsQueue` to wait until the `uploadTimeline` has reached the `uploadTimelineValue` before it can start executing the fragment shaders that sample the new texture. + +This robust handshake ensures that `Simple Engine` never tries to draw a texture that is still being copied, even if the background thread is running on a different CPU core and submitting to a different GPU queue. 
+ +== Navigation + +Previous: xref:Synchronization/Transfer_Queues_Streaming/02_non_blocking_uploads.adoc[Non-Blocking Data Uploads] | Next: xref:Synchronization/Dynamic_Rendering_Sync/01_introduction.adoc[Synchronization in Dynamic Rendering] diff --git a/en/Synchronization/introduction.adoc b/en/Synchronization/introduction.adoc new file mode 100644 index 00000000..bc3af97f --- /dev/null +++ b/en/Synchronization/introduction.adoc @@ -0,0 +1,59 @@ +:pp: {plus}{plus} += Synchronization 2: Mastering the GPU/CPU Handshake + +== Introduction + +Welcome to the **Synchronization 2** tutorial series! If you've spent any significant time with Vulkan, you've likely encountered the "Sync Wall." It's that moment when your code runs perfectly on your development machine but flickers on another, or when you realize that your high-performance GPU is spending half its time waiting for a single, overly conservative barrier. + +Synchronization is arguably the most challenging part of Vulkan, but it’s also the most powerful. It is the language we use to tell the hardware exactly how data flows through the pipeline. In this series, we are going to move beyond the legacy Vulkan 1.0 synchronization systems—those fragmented bitmasks and binary semaphores—and embrace the modern standard: **Synchronization 2** and **Timeline Semaphores**. + +=== Why a New System? + +Vulkan 1.0 synchronization was a breakthrough in control, but it was (and remains) notoriously difficult to work with and understand. The original pipeline barriers were split across multiple structures, and the stage masks often felt like they were designed for the hardware of a decade ago—because they were. Vulkan is 10 years old at the time of writing, and modern techniques and hardware have enabled better approaches that maintain the same level of control. 
+ +Synchronization 2, which arrived as an extension and is now a core part of Vulkan 1.3, simplifies this landscape by unifying everything into the `vk::DependencyInfo` structure. It provides a clearer, more intuitive way to define dependencies, using 64-bit masks that can target modern hardware units—like task and mesh shaders—with surgical precision. + +When we combine this with **Timeline Semaphores**, we move from a world of "binary" signals (on or off) to a world of monotonic counters. This allows us to treat the entire GPU/CPU execution as a single, unified timeline, drastically simplifying how we manage multiple frames in flight and asynchronous work. + +=== What You'll Learn + +This isn't just a list of API calls. We are going to build an engine-grade synchronization architecture. Throughout this series, we will: + +1. **Deconstruct the Dependency**: We'll look under the hood at how GPUs actually handle memory and why an "execution dependency" alone isn't enough to prevent data corruption. +2. **Master the New Barrier**: You'll learn how to use `vk::DependencyInfo` to replace legacy barriers, making your code cleaner and more performant. +3. **Harness the Timeline**: We'll implement Timeline Semaphores as the "master clock" of our engine, replacing fences and binary semaphores with a more robust monotonic counter. +4. **Architect for Concurrency**: We'll rebuild the main engine loop to handle multiple frames in flight and implement asynchronous compute and transfer operations that overlap with your main graphics work. +5. **Leverage Modern Vulkan**: We'll dive into Vulkan 1.4 features, including **Host Image Copies** and tile-local reads in **Dynamic Rendering**, to stay on the cutting edge of the API. + +=== Prerequisites + +This series is designed as an "Advanced Topic" that builds directly on the foundations established in our main Vulkan tutorial. 
We assume you are comfortable with: + +* The basic Vulkan rendering loop (Command Buffers, Pipelines, and Descriptor Sets). +* Modern C{pp} (RAII, smart pointers, and basic templates). +* The fundamental concepts of graphics pipelines (Vertex/Fragment stages). + +If you’re new to Vulkan, we strongly recommend completing the xref:00_Introduction.adoc[main tutorial] first. For those following along with our engine-building journey, this series perfectly complements the xref:Building_a_Simple_Engine/introduction.adoc[Building a Simple Engine] tutorial by providing the deep-dive synchronization knowledge required for a truly professional-grade renderer. + +=== A Note on Tooling + +In this series, we will be using **Slang** for all our shader examples. Slang’s productivity features and its ability to target Vulkan SPIR-V naturally make it the perfect companion for modern synchronization. We’ll also lean heavily on the **LunarG Synchronization Validation** layer—your best friend when it comes to identifying the "Write-After-Read" (WAR) and "Read-After-Write" (RAW) hazards that can be so hard to track down manually. + +Let's begin by tearing down a dependency to see what it's really made of. + +== Chapters in this series + +1. xref:Synchronization/Anatomy_of_a_Dependency/01_introduction.adoc[The Anatomy of a Dependency] - Understanding the core mechanics of how data moves through the pipeline. +2. xref:Synchronization/Pipeline_Barriers_Transitions/01_introduction.adoc[Pipeline Barriers and Image Layout Transitions] - Mastering the new barrier system. +3. xref:Synchronization/Timeline_Semaphores/01_introduction.adoc[Timeline Semaphores: The Master Clock] - Moving to a monotonic world. +4. xref:Synchronization/Frame_in_Flight/01_introduction.adoc[Frame-in-Flight Architecture] - Building the heartbeat of your engine. +5. xref:Synchronization/Async_Compute_Overlap/01_introduction.adoc[Asynchronous Compute & Execution Overlap] - Parallelizing your GPU work. +6. 
xref:Synchronization/Transfer_Queues_Streaming/01_introduction.adoc[Transfer Queues & Asset Streaming Sync] - Streaming assets without the stutter. +7. xref:Synchronization/Dynamic_Rendering_Sync/01_introduction.adoc[Synchronization in Dynamic Rendering] - Modern sync in a pass-less world. +8. xref:Synchronization/Host_Image_Copies_Memory_Sync/01_introduction.adoc[Host Image Copies & Memory Mapped Sync] - Direct CPU-to-GPU memory management. +9. xref:Synchronization/Synchronization_Validation/01_introduction.adoc[Debugging with Synchronization Validation] - Letting the tools find your hazards. +10. xref:Synchronization/Profiling_Optimization/01_introduction.adoc[Profiling, Batching, and Optimization] - Squeezing out every last millisecond. + +== Navigation + +Next: xref:Synchronization/Anatomy_of_a_Dependency/01_introduction.adoc[The Anatomy of a Dependency] diff --git a/images/image_barrier_anatomy.svg b/images/image_barrier_anatomy.svg new file mode 100644 index 00000000..9a8abc23 --- /dev/null +++ b/images/image_barrier_anatomy.svg @@ -0,0 +1,38 @@ + + + + + + + Source State + Stage: Color Attachment + Access: Write + Layout: Color Attachment + + + + vk::ImageMemoryBarrier2 + + + + Transition + + + + Destination State + Stage: Fragment Shader + Access: Shader Read + Layout: Shader Read Only + + + + + + + + + + Execution Wait + + Memory Visibility + + Layout Transition + diff --git a/images/sync2_problem_over_sync.svg b/images/sync2_problem_over_sync.svg new file mode 100644 index 00000000..75c05e0b --- /dev/null +++ b/images/sync2_problem_over_sync.svg @@ -0,0 +1,44 @@ + + + + + + Legacy Vulkan 1.0: The "Log Jam" Problem + + + + TRANSFER + + + COMPUTE + + + + BARRIER + (Global Masks) + + + + VERTEX + + + FRAGMENT + + + + + + + + + + Wait for Compute too! + Wait for Transfer too! 
+ + + + + + + + diff --git a/images/sync2_solution_granular.svg b/images/sync2_solution_granular.svg new file mode 100644 index 00000000..4a880495 --- /dev/null +++ b/images/sync2_solution_granular.svg @@ -0,0 +1,55 @@ + + + + + + Synchronization 2: Granular and Parallel + + + + TRANSFER + + + COMPUTE + + + + BARRIER 1 + + + BARRIER 2 + + + + VERTEX + + + FRAGMENT + + + + + + + + + + Wait ONLY for Transfer + Wait ONLY for Compute + + + + + + + + + + + + + + + + +