Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "model_io/safetensors_io.h"
#include "util.h"

#include "ggml-cpu.h"
#include "ggml_extend_backend.h"

static ggml_type get_export_tensor_type(ModelLoader& model_loader,
const TensorStorage& tensor_storage,
Expand Down Expand Up @@ -103,7 +103,7 @@ bool convert(const char* input_path,
bool output_is_safetensors = ends_with(output_path, ".safetensors");
TensorTypeRules type_rules = parse_tensor_type_rules(tensor_type_rules);

auto backend = ggml_backend_cpu_init();
auto backend = sd_backend_cpu_init();
size_t mem_size = 1 * 1024 * 1024; // for padding
mem_size += model_loader.get_tensor_storage_map().size() * ggml_tensor_overhead();
mem_size += model_loader.get_params_mem_size(backend, type);
Expand Down
2 changes: 1 addition & 1 deletion src/flux.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1567,7 +1567,7 @@ namespace Flux {

static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_COUNT;

ModelLoader model_loader;
Expand Down
18 changes: 9 additions & 9 deletions src/ggml_extend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1442,7 +1442,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx,

__STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const ggml_tensor* tensor, void* data, size_t offset, size_t size) {
if ((sd_backend_is(backend, "ROCm") || sd_backend_is(backend, "CUDA") || sd_backend_is(backend, "SYCL")) &&
!ggml_backend_is_cpu(backend)) {
!sd_backend_is_cpu(backend)) {
ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
ggml_backend_synchronize(backend);
return;
Expand Down Expand Up @@ -1899,7 +1899,7 @@ struct GGMLRunner {
LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
get_desc().c_str(),
compute_buffer_size / 1024.0 / 1024.0,
ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
sd_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM");
return true;
}

Expand Down Expand Up @@ -1986,7 +1986,7 @@ struct GGMLRunner {
LOG_DEBUG("%s cache backend buffer size = % 6.2f MB(%s) (%i tensors)",
get_desc().c_str(),
cache_buffer_size / (1024.f * 1024.f),
ggml_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM",
sd_backend_is_cpu(runtime_backend) ? "RAM" : "VRAM",
num_tensors);
if (old_cache_buffer != nullptr) {
ggml_backend_buffer_free(old_cache_buffer);
Expand Down Expand Up @@ -2293,13 +2293,13 @@ struct GGMLRunner {
max_graph_vram_bytes > 0 &&
plan.segments.size() > 1 &&
params_backend != runtime_backend &&
!ggml_backend_is_cpu(runtime_backend);
!sd_backend_is_cpu(runtime_backend);
}

// Reports whether graph-cut segmented compute may be attempted.
// True only when all of the following hold:
//   - a graph VRAM budget is set (max_graph_vram_bytes > 0),
//   - parameters live on a different backend than the runtime backend,
//   - the runtime backend is not a CPU device.
// The stale pre-change call to ggml_backend_is_cpu() is dropped; it used to
// terminate the return expression and left the sd_backend_is_cpu() check as a
// no-effect statement.
bool can_attempt_graph_cut_segmented_compute() const {
    return max_graph_vram_bytes > 0 &&
           params_backend != runtime_backend &&
           !sd_backend_is_cpu(runtime_backend);
}

bool resolve_graph_cut_plan(ggml_cgraph* gf,
Expand Down Expand Up @@ -2436,8 +2436,8 @@ struct GGMLRunner {
int64_t t_copy_begin = ggml_time_ms();
copy_data_to_backend_tensor(gf, !preserve_backend_tensor_data_map);
int64_t t_copy_end = ggml_time_ms();
if (ggml_backend_is_cpu(runtime_backend)) {
ggml_backend_cpu_set_n_threads(runtime_backend, n_threads);
if (sd_backend_is_cpu(runtime_backend)) {
sd_backend_cpu_set_n_threads(runtime_backend, n_threads);
}

int64_t t_compute_begin = ggml_time_ms();
Expand Down Expand Up @@ -2679,7 +2679,7 @@ struct GGMLRunner {
LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
get_desc().c_str(),
params_buffer_size / (1024.f * 1024.f),
ggml_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
sd_backend_is_cpu(params_backend) ? "RAM" : "VRAM",
num_tensors);
return true;
}
Expand Down Expand Up @@ -2746,7 +2746,7 @@ struct GGMLRunner {
return nullptr;
}
// it's performing a compute, check if backend isn't cpu
if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) {
if (!sd_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) {
// pass input tensors to gpu memory
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

Expand Down
66 changes: 61 additions & 5 deletions src/ggml_extend_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <stdexcept>
#include <vector>

#include "stable-diffusion.h"
#include "util.h"

static std::string trim_copy(const std::string& value) {
Expand Down Expand Up @@ -300,6 +301,61 @@ static ggml_backend_t init_named_backend(const std::string& name) {
return ggml_backend_init_by_name(resolved.c_str(), nullptr);
}

// Tells whether `backend` is backed by a CPU device.
// Returns false for a null backend and for a backend whose device handle is
// null; otherwise compares the device type against
// GGML_BACKEND_DEVICE_TYPE_CPU.
bool sd_backend_is_cpu(ggml_backend_t backend) {
    if (!backend) {
        return false;
    }
    const auto device = ggml_backend_get_device(backend);
    if (!device) {
        return false;
    }
    return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
}

// Creates a CPU backend through the generic device-type initializer.
// ggml_backend_load_all_once() is invoked first, then the backend is
// requested by device type with no extra parameters.
ggml_backend_t sd_backend_cpu_init() {
    ggml_backend_load_all_once();
    ggml_backend_t cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
    return cpu_backend;
}

// Applies `n_threads` to a CPU backend.
// The setter is not linked directly: it is looked up by name
// ("ggml_backend_set_n_threads") in the registry that owns the backend's
// device.
// Returns true only when the setter was found and invoked; false when
// `backend` is null, is not on a CPU device, or the registry does not expose
// the setter.
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend, int n_threads) {
    if (backend == nullptr) {
        return false;
    }

    const auto device = ggml_backend_get_device(backend);
    const bool on_cpu = device != nullptr &&
                        ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
    if (!on_cpu) {
        return false;
    }

    const auto registry = ggml_backend_dev_backend_reg(device);
    const auto set_n_threads =
        (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address(registry, "ggml_backend_set_n_threads");
    if (set_n_threads == nullptr) {
        return false;
    }

    set_n_threads(backend, n_threads);
    return true;
}

// Returns a human-readable summary of the CPU backend's feature flags.
//
// The text is built once, on first call, inside the initializer of a
// function-local static; later calls return the same cached buffer. The
// returned pointer is owned by that static and must not be freed.
//
// Features are obtained by looking up "ggml_backend_get_features" in the
// registry of the first CPU-type device. If there is no CPU device, the
// registry does not expose the entry point, or the entry point returns a null
// array, a single warning is logged and only the header line is returned.
const char* sd_get_system_info() {
    static const std::string cache_info = []() -> std::string {
        ggml_backend_load_all_once();
        std::string info = "System Info: \n";

        ggml_backend_feature* feat = nullptr;
        auto dev                   = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (dev != nullptr) {
            auto reg = ggml_backend_dev_backend_reg(dev);
            auto ggml_backend_get_features_fn =
                (ggml_backend_get_features_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
            if (ggml_backend_get_features_fn != nullptr) {
                feat = ggml_backend_get_features_fn(reg);
            }
        }

        // Guard against a missing device/entry point and against a null
        // feature array, which would otherwise be dereferenced below.
        if (feat == nullptr) {
            LOG_WARN("unable to get CPU features");
            return info;
        }

        // The array is walked until the first entry lacking a name or a value.
        for (; feat->name && feat->value; ++feat) {
            info += " ";
            info += feat->name;
            info += " = ";
            info += feat->value;
            info += " | ";
        }
        return info;
    }();
    return cache_info.c_str();
}

static ggml_backend_t sd_get_default_backend() {
ggml_backend_load_all_once();
static std::once_flag once;
Expand Down Expand Up @@ -349,10 +405,10 @@ static ggml_backend_t sd_get_default_backend() {

if (!backend) {
LOG_WARN("loading CPU backend");
backend = ggml_backend_cpu_init();
backend = sd_backend_cpu_init();
}

if (ggml_backend_is_cpu(backend)) {
if (sd_backend_is_cpu(backend)) {
LOG_DEBUG("Using CPU backend");
}

Expand Down Expand Up @@ -452,19 +508,19 @@ ggml_backend_t SDBackendManager::params_backend(SDBackendModule module) {
}

// Reports whether the runtime backend selected for `module` is a CPU device.
// Uses sd_backend_is_cpu(), which returns false for a null backend. The
// earlier ggml_backend_is_cpu() return made this call unreachable and has
// been removed.
bool SDBackendManager::runtime_backend_is_cpu(SDBackendModule module) {
    return sd_backend_is_cpu(runtime_backend(module));
}

// Reports whether the parameter-storage backend selected for `module` is a
// CPU device. Uses sd_backend_is_cpu(), which returns false for a null
// backend. The earlier ggml_backend_is_cpu() return made this call
// unreachable and has been removed.
bool SDBackendManager::params_backend_is_cpu(SDBackendModule module) {
    return sd_backend_is_cpu(params_backend(module));
}

bool SDBackendManager::runtime_backend_supports_host_buffer(SDBackendModule module) {
ggml_backend_t backend = runtime_backend(module);
if (backend == nullptr) {
return false;
}
if (ggml_backend_is_cpu(backend)) {
if (sd_backend_is_cpu(backend)) {
return true;
}
ggml_backend_dev_t dev = ggml_backend_get_device(backend);
Expand Down
4 changes: 3 additions & 1 deletion src/ggml_extend_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#include <unordered_map>

#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml.h"

enum class SDBackendModule {
Expand Down Expand Up @@ -72,6 +71,9 @@ class SDBackendManager {
};

bool sd_backend_is(ggml_backend_t backend, const std::string& name);
// Returns true when `backend` is non-null, has a non-null device, and that
// device's type is GGML_BACKEND_DEVICE_TYPE_CPU; false otherwise.
bool sd_backend_is_cpu(ggml_backend_t backend);
// Calls ggml_backend_load_all_once() and then initializes a backend by device
// type GGML_BACKEND_DEVICE_TYPE_CPU with no parameters.
// NOTE(review): presumably yields nullptr when no CPU device is registered — confirm.
ggml_backend_t sd_backend_cpu_init();
// Sets the thread count on a CPU backend via the "ggml_backend_set_n_threads"
// entry point looked up in the device's backend registry. Returns true only
// if the entry point was found and called; false for a null backend, a
// non-CPU backend, or a registry without that entry point.
bool sd_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
const char* sd_backend_module_name(SDBackendModule module);
void ggml_ext_im_set_f32_1d(const struct ggml_tensor* tensor, int i, float value);
#endif
2 changes: 1 addition & 1 deletion src/llm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2041,7 +2041,7 @@ namespace LLM {
static void load_from_file_and_test(const std::string& file_path) {
// cpu f16: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_COUNT;

ModelLoader model_loader;
Expand Down
4 changes: 2 additions & 2 deletions src/lora.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,7 @@ struct LoraModel : public GGMLRunner {
}

ggml_tensor* original_tensor = model_tensor;
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
if (!sd_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
model_tensor = ggml_dup_tensor(compute_ctx, model_tensor);
set_backend_tensor_data(model_tensor, original_tensor->data);
}
Expand All @@ -786,7 +786,7 @@ struct LoraModel : public GGMLRunner {
final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff);
}
ggml_build_forward_expand(gf, final_tensor);
if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
if (!sd_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) {
original_tensor_to_final_tensor[original_tensor] = final_tensor;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/ltx_audio_vae.h
Original file line number Diff line number Diff line change
Expand Up @@ -1052,7 +1052,7 @@ namespace LTXV {
static void load_from_file_and_test(const std::string& model_path,
const std::string& input_path,
const std::string& prefix = "") {
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
// ggml_backend_t backend = ggml_backend_cuda_init(0);
LOG_INFO("loading ltx audio vae from '%s'", model_path.c_str());

Expand Down
2 changes: 1 addition & 1 deletion src/ltx_vae.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1517,7 +1517,7 @@ struct LTXVideoVAE : public VAE {
static void load_from_file_and_test(const std::string& model_path,
const std::string& input_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
LOG_INFO("loading ltx vae from '%s'", model_path.c_str());

ModelLoader model_loader;
Expand Down
2 changes: 1 addition & 1 deletion src/ltxv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1995,7 +1995,7 @@ namespace LTXV {
const std::string& audio_x_path = "",
const std::string& audio_timesteps_path = "") {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
LOG_INFO("loading ltxav from '%s'", model_path.c_str());

ModelLoader model_loader;
Expand Down
2 changes: 1 addition & 1 deletion src/mmdit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -947,7 +947,7 @@ struct MMDiTRunner : public DiffusionModelRunner {

static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, backend);
{
Expand Down
2 changes: 1 addition & 1 deletion src/qwen_image.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ namespace Qwen {
// cuda q8: pass
// cuda q8 fa: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_Q8_0;

ModelLoader model_loader;
Expand Down
4 changes: 2 additions & 2 deletions src/stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1095,7 +1095,7 @@ class StableDiffusionGGML {
if (module_backend == nullptr) {
return false;
}
if (ggml_backend_is_cpu(module_backend)) {
if (sd_backend_is_cpu(module_backend)) {
total_params_ram_size += size;
} else {
total_params_vram_size += size;
Expand All @@ -1110,7 +1110,7 @@ class StableDiffusionGGML {
if (module_backend == nullptr) {
return "N/A";
}
return ggml_backend_is_cpu(module_backend) ? "RAM" : "VRAM";
return sd_backend_is_cpu(module_backend) ? "RAM" : "VRAM";
};

if (!add_params_memory(clip_params_mem_size, SDBackendModule::TE) ||
Expand Down
2 changes: 1 addition & 1 deletion src/t5.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ struct T5Embedder {
// cuda f32: pass
// cuda q8_0: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;

ModelLoader model_loader;
Expand Down
29 changes: 0 additions & 29 deletions src/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@
#include <unistd.h>
#endif

#include "ggml-backend.h"
#include "ggml.h"
#include "ggml_extend_backend.h"
#include "stable-diffusion.h"

bool ends_with(const std::string& str, const std::string& ending) {
Expand Down Expand Up @@ -974,30 +972,3 @@ std::vector<std::pair<std::string, float>> split_quotation_attention(
}
return result;
}

// namespace is needed to avoid conflicts with ggml_backend_extend.hpp
namespace ggml_cpu {
#include "ggml-cpu.h"
}

// Returns a human-readable line listing the CPU SIMD capabilities reported by
// the ggml_cpu_has_*() probes.
//
// The string is assembled once, in the initializer of a function-local
// static, and the same buffer is returned on every call. Previously each call
// re-formatted the text into a shared static char[1024], which raced when
// called from several threads and silently truncated anything past 1023
// characters. The returned pointer is owned by the static and must not be
// freed.
const char* sd_get_system_info() {
    static const std::string info = []() -> std::string {
        using namespace ggml_cpu;
        std::stringstream ss;
        ss << "System Info: \n";
        ss << " SSE3 = " << ggml_cpu_has_sse3() << " | ";
        ss << " AVX = " << ggml_cpu_has_avx() << " | ";
        ss << " AVX2 = " << ggml_cpu_has_avx2() << " | ";
        ss << " AVX512 = " << ggml_cpu_has_avx512() << " | ";
        ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | ";
        ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | ";
        ss << " FMA = " << ggml_cpu_has_fma() << " | ";
        ss << " NEON = " << ggml_cpu_has_neon() << " | ";
        ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | ";
        ss << " F16C = " << ggml_cpu_has_f16c() << " | ";
        ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | ";
        ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | ";
        ss << " VSX = " << ggml_cpu_has_vsx() << " | ";
        return ss.str();
    }();
    return info.c_str();
}
4 changes: 2 additions & 2 deletions src/wan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1328,7 +1328,7 @@ namespace WAN {

static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<WanVAERunner> vae = std::make_shared<WanVAERunner>(backend, backend, String2TensorStorage{}, "", false, VERSION_WAN2_2_TI2V);
{
Expand Down Expand Up @@ -2348,7 +2348,7 @@ namespace WAN {

static void load_from_file_and_test(const std::string& file_path) {
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_F16;
LOG_INFO("loading from '%s'", file_path.c_str());

Expand Down
2 changes: 1 addition & 1 deletion src/z_image.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ namespace ZImage {
// cuda q8: pass
// cuda q8 fa: pass
// ggml_backend_t backend = ggml_backend_cuda_init(0);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_t backend = sd_backend_cpu_init();
ggml_type model_data_type = GGML_TYPE_Q8_0;

ModelLoader model_loader;
Expand Down
Loading