diff --git a/src/convert.cpp b/src/convert.cpp index 7cae8df0f..cc1cdd7e2 100644 --- a/src/convert.cpp +++ b/src/convert.cpp @@ -8,7 +8,7 @@ #include "model_io/safetensors_io.h" #include "util.h" -#include "ggml-cpu.h" +#include "ggml_extend_backend.h" static ggml_type get_export_tensor_type(ModelLoader& model_loader, const TensorStorage& tensor_storage, @@ -103,7 +103,7 @@ bool convert(const char* input_path, bool output_is_safetensors = ends_with(output_path, ".safetensors"); TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules); - auto backend = ggml_backend_cpu_init(); + auto backend = sd_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead(); mem_size += model_loader.get_params_mem_size(backend, type); diff --git a/src/flux.hpp b/src/flux.hpp index 3f8c6c882..15a3f9228 100644 --- a/src/flux.hpp +++ b/src/flux.hpp @@ -1567,7 +1567,7 @@ namespace Flux { static void load_from_file_and_test(const std::string& file_path) { // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_COUNT; ModelLoader model_loader; diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 2cdfd5713..b018852a3 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -1442,7 +1442,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx, __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { if ((sd_backend_is(backend, "ROCm") || sd_backend_is(backend, "CUDA") || sd_backend_is(backend, "SYCL")) && - !ggml_backend_is_cpu(backend)) { + !sd_backend_is_cpu(backend)) { ggml_backend_tensor_get_async(backend, tensor, data, offset, size); ggml_backend_synchronize(backend); return; @@ -1899,7 +1899,7 @@ struct GGMLRunner { LOG_DEBUG("%s compute buffer size: %.2f MB(%s)", get_desc().c_str(), compute_buffer_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM"); + sd_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM"); return true; } @@ -1986,7 +1986,7 @@ struct GGMLRunner { LOG_DEBUG("%s cache backend buffer size = % 6.2f MB(%s) (%i tensors)", get_desc().c_str(), cache_buffer_size / (1024.f * 1024.f), - ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM", + sd_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM", num_tensors); if (old_cache_buffer != nullptr) { ggml_backend_buffer_free(old_cache_buffer); @@ -2293,13 +2293,13 @@ struct GGMLRunner { max_graph_vram_bytes > 0 && plan.segments.size() > 1 && params_backend != runtime_backend && - !ggml_backend_is_cpu(runtime_backend); + !sd_backend_is_cpu(runtime_backend); } bool can_attempt_graph_cut_segmented_compute() const { return max_graph_vram_bytes > 0 && params_backend != runtime_backend && - !ggml_backend_is_cpu(runtime_backend); + !sd_backend_is_cpu(runtime_backend); } bool resolve_graph_cut_plan(ggml_cgraph* gf, @@ -2436,8 +2436,8 @@ struct GGMLRunner { int64_t t_copy_begin = ggml_time_ms(); copy_data_to_backend_tensor(gf, !preserve_backend_tensor_data_map); int64_t t_copy_end = ggml_time_ms(); - if (ggml_backend_is_cpu(runtime_backend)) { - ggml_backend_cpu_set_n_threads(runtime_backend, n_threads); + if (sd_backend_is_cpu(runtime_backend)) { + sd_backend_cpu_set_n_threads(runtime_backend, n_threads); } int64_t t_compute_begin = ggml_time_ms(); @@ -2679,7 +2679,7 @@ struct GGMLRunner { LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)", get_desc().c_str(), params_buffer_size / (1024.f * 1024.f), - ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM", + sd_backend_is_cpu(params_backend) ? "RAM" : "VRAM", num_tensors); return true; } @@ -2746,7 +2746,7 @@ struct GGMLRunner { return nullptr; } // it's performing a compute, check if backend isn't cpu - if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) { + if (!sd_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) { // pass input tensors to gpu memory auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor); diff --git a/src/ggml_extend_backend.cpp b/src/ggml_extend_backend.cpp index 4bf8268e5..5e95eae5a 100644 --- a/src/ggml_extend_backend.cpp +++ b/src/ggml_extend_backend.cpp @@ -8,6 +8,7 @@ #include #include +#include "stable-diffusion.h" #include "util.h" static std::string trim_copy(const std::string& value) { @@ -300,6 +301,61 @@ static ggml_backend_t init_named_backend(const std::string& name) { return ggml_backend_init_by_name(resolved.c_str(), nullptr); } +bool sd_backend_is_cpu(ggml_backend_t backend) { + if (backend == nullptr) { + return false; + } + auto dev = ggml_backend_get_device(backend); + return dev != nullptr && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU; +} + +ggml_backend_t sd_backend_cpu_init() { + ggml_backend_load_all_once(); + return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); +} + +bool sd_backend_cpu_set_n_threads(ggml_backend_t backend, int n_threads) { + if (backend == nullptr) { + return false; + } + auto dev = ggml_backend_get_device(backend); + if (dev != nullptr && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + auto reg = ggml_backend_dev_backend_reg(dev); + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn != nullptr) { + ggml_backend_set_n_threads_fn(backend, n_threads); + return true; + } + } + return false; +} + +const char* sd_get_system_info() { + static std::string cache_info = []() -> std::string { + ggml_backend_load_all_once(); + std::stringstream ss; + ss << "System Info: \n"; + auto dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (dev != nullptr) { + auto reg = ggml_backend_dev_backend_reg(dev); + auto ggml_backend_get_features_fn = (ggml_backend_get_features_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (ggml_backend_get_features_fn != nullptr) { + ggml_backend_feature* feat = ggml_backend_get_features_fn(reg); + while (feat->name && feat->value) { + ss << " " << feat->name << " = " << feat->value << " | "; + feat++; + } + } else { + LOG_WARN("unable to get CPU features"); + } + } else { + LOG_WARN("unable to get CPU features"); + } + return ss.str(); + }(); + return cache_info.c_str(); +} + static ggml_backend_t sd_get_default_backend() { ggml_backend_load_all_once(); static std::once_flag once; @@ -349,10 +405,10 @@ static ggml_backend_t sd_get_default_backend() { if (!backend) { LOG_WARN("loading CPU backend"); - backend = ggml_backend_cpu_init(); + backend = sd_backend_cpu_init(); } - if (ggml_backend_is_cpu(backend)) { + if (sd_backend_is_cpu(backend)) { LOG_DEBUG("Using CPU backend"); } @@ -452,11 +508,11 @@ ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) { } bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) { - return ggml_backend_is_cpu(runtime_backend(module)); + return sd_backend_is_cpu(runtime_backend(module)); } bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) { - return ggml_backend_is_cpu(params_backend(module)); + return sd_backend_is_cpu(params_backend(module)); } bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) { @@ -464,7 +520,7 @@ bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule modu if (backend == nullptr) { return false; } - if (ggml_backend_is_cpu(backend)) { + if (sd_backend_is_cpu(backend)) { return true; } ggml_backend_dev_t dev = ggml_backend_get_device(backend); diff --git a/src/ggml_extend_backend.h b/src/ggml_extend_backend.h index b98efc10d..972fbee73 100644 --- a/src/ggml_extend_backend.h +++ b/src/ggml_extend_backend.h @@ -8,7 +8,6 @@ #include #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml.h" enum class SDBackendModule { @@ -72,6 +71,9 @@ class SDBackendManager { }; bool sd_backend_is(ggml_backend_t backend, const std::string& name); +bool sd_backend_is_cpu(ggml_backend_t backend); +ggml_backend_t sd_backend_cpu_init(); +bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); const char* sd_backend_module_name(SDBackendModule module); void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value); #endif diff --git a/src/llm.hpp b/src/llm.hpp index 08cf784bb..0dbd37d95 100644 --- a/src/llm.hpp +++ b/src/llm.hpp @@ -2041,7 +2041,7 @@ namespace LLM { static void load_from_file_and_test(const std::string& file_path) { // cpu f16: pass // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_COUNT; ModelLoader model_loader; diff --git a/src/lora.hpp b/src/lora.hpp index 17012c0ed..d87b494c6 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -772,7 +772,7 @@ struct LoraModel : public GGMLRunner { } ggml_tensor* original_tensor = model_tensor; - if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) { + if (!sd_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) { model_tensor = ggml_dup_tensor(compute_ctx, model_tensor); set_backend_tensor_data(model_tensor, original_tensor->data); } @@ -786,7 +786,7 @@ struct LoraModel : public GGMLRunner { final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff); } ggml_build_forward_expand(gf, final_tensor); - if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) { + if (!sd_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) { original_tensor_to_final_tensor[original_tensor] = final_tensor; } } diff --git a/src/ltx_audio_vae.h b/src/ltx_audio_vae.h index 490d282c7..88c376314 100644 --- a/src/ltx_audio_vae.h +++ b/src/ltx_audio_vae.h @@ -1052,7 +1052,7 @@ namespace LTXV { static void load_from_file_and_test(const std::string& model_path, const std::string& input_path, const std::string& prefix = "") { - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); // ggml_backend_t backend = ggml_backend_cuda_init(0); LOG_INFO("loading ltx audio vae from '%s'", model_path.c_str()); diff --git a/src/ltx_vae.hpp b/src/ltx_vae.hpp index 5f5467be5..13ec0e399 100644 --- a/src/ltx_vae.hpp +++ b/src/ltx_vae.hpp @@ -1517,7 +1517,7 @@ struct LTXVideoVAE : public VAE { static void load_from_file_and_test(const std::string& model_path, const std::string& input_path) { // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); LOG_INFO("loading ltx vae from '%s'", model_path.c_str()); ModelLoader model_loader; diff --git a/src/ltxv.hpp b/src/ltxv.hpp index 899532e3e..a7d3fb04e 100644 --- a/src/ltxv.hpp +++ b/src/ltxv.hpp @@ -1995,7 +1995,7 @@ namespace LTXV { const std::string& audio_x_path = "", const std::string& audio_timesteps_path = "") { // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); LOG_INFO("loading ltxav from '%s'", model_path.c_str()); ModelLoader model_loader; diff --git a/src/mmdit.hpp b/src/mmdit.hpp index 71f8771f6..45bc2d916 100644 --- a/src/mmdit.hpp +++ b/src/mmdit.hpp @@ -947,7 +947,7 @@ struct MMDiTRunner : public DiffusionModelRunner { static void load_from_file_and_test(const std::string& file_path) { // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; std::shared_ptr mmdit = std::make_shared(backend, backend); { diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp index c0a2f98da..bea71b97f 100644 --- a/src/qwen_image.hpp +++ b/src/qwen_image.hpp @@ -683,7 +683,7 @@ namespace Qwen { // cuda q8: pass // cuda q8 fa: pass // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; ModelLoader model_loader; diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 28598eb67..3c6dbc774 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1095,7 +1095,7 @@ class StableDiffusionGGML { if (module_backend == nullptr) { return false; } - if (ggml_backend_is_cpu(module_backend)) { + if (sd_backend_is_cpu(module_backend)) { total_params_ram_size += size; } else { total_params_vram_size += size; @@ -1110,7 +1110,7 @@ class StableDiffusionGGML { if (module_backend == nullptr) { return "N/A"; } - return ggml_backend_is_cpu(module_backend) ? "RAM" : "VRAM"; + return sd_backend_is_cpu(module_backend) ? "RAM" : "VRAM"; }; if (!add_params_memory(clip_params_mem_size, SDBackendModule::TE) || diff --git a/src/t5.hpp b/src/t5.hpp index e729bda60..9b2bdaef1 100644 --- a/src/t5.hpp +++ b/src/t5.hpp @@ -563,7 +563,7 @@ struct T5Embedder { // cuda f32: pass // cuda q8_0: pass // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; ModelLoader model_loader; diff --git a/src/util.cpp b/src/util.cpp index 77fc5429c..6f5f1990a 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -25,9 +25,7 @@ #include #endif -#include "ggml-backend.h" #include "ggml.h" -#include "ggml_extend_backend.h" #include "stable-diffusion.h" bool ends_with(const std::string& str, const std::string& ending) { @@ -974,30 +972,3 @@ std::vector> split_quotation_attention( } return result; } - -// namespace is needed to avoid conflicts with ggml_backend_extend.hpp -namespace ggml_cpu { -#include "ggml-cpu.h" -} - -const char* sd_get_system_info() { - using namespace ggml_cpu; - static char buffer[1024]; - std::stringstream ss; - ss << "System Info: \n"; - ss << " SSE3 = " << ggml_cpu_has_sse3() << " | "; - ss << " AVX = " << ggml_cpu_has_avx() << " | "; - ss << " AVX2 = " << ggml_cpu_has_avx2() << " | "; - ss << " AVX512 = " << ggml_cpu_has_avx512() << " | "; - ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | "; - ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | "; - ss << " FMA = " << ggml_cpu_has_fma() << " | "; - ss << " NEON = " << ggml_cpu_has_neon() << " | "; - ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | "; - ss << " F16C = " << ggml_cpu_has_f16c() << " | "; - ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | "; - ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | "; - ss << " VSX = " << ggml_cpu_has_vsx() << " | "; - snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str()); - return buffer; -} diff --git a/src/wan.hpp b/src/wan.hpp index 84c493ce5..68f020e25 100644 --- a/src/wan.hpp +++ b/src/wan.hpp @@ -1328,7 +1328,7 @@ namespace WAN { static void load_from_file_and_test(const std::string& file_path) { // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; std::shared_ptr vae = std::make_shared(backend, backend, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V); { @@ -2348,7 +2348,7 @@ namespace WAN { static void load_from_file_and_test(const std::string& file_path) { // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/src/z_image.hpp b/src/z_image.hpp index 5e1bdaed3..82dbe0491 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -615,7 +615,7 @@ namespace ZImage { // cuda q8: pass // cuda q8 fa: pass // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_t backend = sd_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; ModelLoader model_loader;