From bc1551b40b3530e8672b9cbe32667270ef052164 Mon Sep 17 00:00:00 2001
From: rmatif <kingrealriadh@gmail.com>
Date: Sat, 13 Sep 2025 09:48:44 +0000
Subject: [PATCH 1/2] allow the use of absolute paths for lora and embeddings

---
 conditioner.hpp      | 312 ++++++++++++++++++++++++++++++++++---------
 stable-diffusion.cpp |  62 ++++-----
 2 files changed, 283 insertions(+), 91 deletions(-)
diff --git a/conditioner.hpp b/conditioner.hpp
index cfd2b4ca7..0f8edac1d 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -196,27 +196,91 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     }
 
     std::vector<int> convert_token_to_id(std::string text) {
+        size_t search_pos = 0;
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+            std::string token_str;
+            size_t consumed_len = 0;
+            bool is_embed_tag = false;
+
+            // The tokenizer gives us chunks of text. We only process the first potential embedding token in that chunk.
+            std::string trimmed_str = trim(str);
+            size_t leading_spaces = trimmed_str.empty() ? str.length() : str.find(trimmed_str);  // leading whitespace only; a length difference would also count any trailing whitespace trim() removed
+
+            if (starts_with(trimmed_str, "<embed:")) {
+                size_t tag_end = trimmed_str.find(">");
+                if (tag_end == std::string::npos) {
+                    return false; // Incomplete tag.
+                }
+                std::string lower_tag = trimmed_str.substr(0, tag_end + 1);
+                token_str             = lower_tag;  // Fallback to lowercased version
+
+                if (text.length() >= lower_tag.length()) {
+                    for (size_t i = search_pos; i <= text.length() - lower_tag.length(); ++i) {
+                        bool match = true;
+                        for (size_t j = 0; j < lower_tag.length(); ++j) {
+                            if (static_cast<char>(std::tolower(static_cast<unsigned char>(text[i + j]))) != lower_tag[j]) {  // unsigned char: tolower() on a negative char is undefined
+                                match = false;
+                                break;
+                            }
+                        }
+                        if (match) {
+                            token_str  = text.substr(i, lower_tag.length());
+                            search_pos = i + token_str.length();
+                            break;
+                        }
+                    }
+                }
+                consumed_len = leading_spaces + token_str.length();
+                is_embed_tag = true;
+            } else {
+                // Not a tag. Could be a plain trigger word.
+                size_t first_delim = trimmed_str.find_first_of(" ,");
+                token_str = (first_delim == std::string::npos) ? trimmed_str : trimmed_str.substr(0, first_delim);
+                consumed_len = leading_spaces + token_str.length();
+            }
+
+            std::string embd_name = trim(token_str);
+            if (is_embed_tag) {
+                embd_name = embd_name.substr(strlen("<embed:"), embd_name.length() - strlen("<embed:") - 1);
             }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+
+            std::string embd_path;
+            bool is_path = contains(embd_name, "/") || contains(embd_name, "\\");
+
+            if (is_path) {
+                if (file_exists(embd_name)) {
+                    embd_path = embd_name;
+                } else if (file_exists(embd_name + ".safetensors")) {
+                    embd_path = embd_name + ".safetensors";
+                } else if (file_exists(embd_name + ".pt")) {
+                    embd_path = embd_name + ".pt";
+                } else if (file_exists(embd_name + ".ckpt")) {
+                    embd_path = embd_name + ".ckpt";
+                }
+            } else {
+                embd_path = get_full_path(embd_dir, embd_name + ".pt");
+                if (embd_path.size() == 0) {
+                    embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+                }
+                if (embd_path.size() == 0) {
+                    embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+                }
             }
+
             if (embd_path.size() > 0) {
                 if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
+                    str = str.substr(consumed_len);
                     return true;
                 }
             }
+
+            if (is_embed_tag) {
+                LOG_WARN("could not load embedding '%s'", embd_name.c_str());
+                str = str.substr(consumed_len);
+                return true; // Consume the failed tag so the tokenizer doesn't try to parse it as text.
+            }
+
+            // It was not a tag and we couldn't find a file for it as a trigger word.
             return false;
         };
         std::vector<int> curr_tokens = tokenizer.encode(text, on_new_token_cb);
@@ -245,30 +309,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
         }
 
-        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
-            }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
-            }
-            return false;
-        };
-
         std::vector<int> tokens;
         std::vector<float> weights;
         std::vector<bool> class_token_mask;
@@ -278,6 +318,93 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             std::vector<int> clean_input_ids;
             const std::string& curr_text = item.first;
             float curr_weight            = item.second;
+            size_t search_pos            = 0;
+            auto on_new_token_cb         = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
+                std::string token_str;
+                size_t consumed_len = 0;
+                bool is_embed_tag   = false;
+
+                // The tokenizer gives us chunks of text. We only process the first potential embedding token in that chunk.
+                std::string trimmed_str = trim(str);
+                size_t leading_spaces   = trimmed_str.empty() ? str.length() : str.find(trimmed_str);  // leading whitespace only; a length difference would also count any trailing whitespace trim() removed
+
+                if (starts_with(trimmed_str, "<embed:")) {
+                    size_t tag_end = trimmed_str.find(">");
+                    if (tag_end == std::string::npos) {
+                        return false;  // Incomplete tag.
+                    }
+                    std::string lower_tag = trimmed_str.substr(0, tag_end + 1);
+                    token_str             = lower_tag;  // Fallback to lowercased version
+
+                    if (curr_text.length() >= lower_tag.length()) {
+                        for (size_t i = search_pos; i <= curr_text.length() - lower_tag.length(); ++i) {
+                            bool match = true;
+                            for (size_t j = 0; j < lower_tag.length(); ++j) {
+                                if (static_cast<char>(std::tolower(static_cast<unsigned char>(curr_text[i + j]))) != lower_tag[j]) {  // unsigned char: tolower() on a negative char is undefined
+                                    match = false;
+                                    break;
+                                }
+                            }
+                            if (match) {
+                                token_str  = curr_text.substr(i, lower_tag.length());
+                                search_pos = i + token_str.length();
+                                break;
+                            }
+                        }
+                    }
+                    consumed_len = leading_spaces + token_str.length();
+                    is_embed_tag = true;
+                } else {
+                    // Not a tag. Could be a plain trigger word.
+                    size_t first_delim = trimmed_str.find_first_of(" ,");
+                    token_str          = (first_delim == std::string::npos) ? trimmed_str : trimmed_str.substr(0, first_delim);
+                    consumed_len       = leading_spaces + token_str.length();
+                }
+
+                std::string embd_name = trim(token_str);
+                if (is_embed_tag) {
+                    embd_name = embd_name.substr(strlen("<embed:"), embd_name.length() - strlen("<embed:") - 1);
+                }
+
+                std::string embd_path;
+                bool is_path = contains(embd_name, "/") || contains(embd_name, "\\");
+
+                if (is_path) {
+                    if (file_exists(embd_name)) {
+                        embd_path = embd_name;
+                    } else if (file_exists(embd_name + ".safetensors")) {
+                        embd_path = embd_name + ".safetensors";
+                    } else if (file_exists(embd_name + ".pt")) {
+                        embd_path = embd_name + ".pt";
+                    } else if (file_exists(embd_name + ".ckpt")) {
+                        embd_path = embd_name + ".ckpt";
+                    }
+                } else {
+                    embd_path = get_full_path(embd_dir, embd_name + ".pt");
+                    if (embd_path.size() == 0) {
+                        embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+                    }
+                    if (embd_path.size() == 0) {
+                        embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+                    }
+                }
+
+                if (embd_path.size() > 0) {
+                    if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+                        str = str.substr(consumed_len);
+                        return true;
+                    }
+                }
+
+                if (is_embed_tag) {
+                    LOG_WARN("could not load embedding '%s'", embd_name.c_str());
+                    str = str.substr(consumed_len);
+                    return true;  // Consume the failed tag so the tokenizer doesn't try to parse it as text.
+                }
+
+                // It was not a tag and we couldn't find a file for it as a trigger word.
+                return false;
+            };
             // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
             std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
             int32_t clean_index          = 0;
@@ -359,35 +486,98 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
         }
 
-        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
-            }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
-            }
-            return false;
-        };
-
         std::vector<int> tokens;
         std::vector<float> weights;
         for (const auto& item : parsed_attention) {
             const std::string& curr_text = item.first;
             float curr_weight            = item.second;
+            size_t search_pos            = 0;
+            auto on_new_token_cb         = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
+                std::string token_str;
+                size_t consumed_len = 0;
+                bool is_embed_tag   = false;
+
+                // The tokenizer gives us chunks of text. We only process the first potential embedding token in that chunk.
+                std::string trimmed_str = trim(str);
+                size_t leading_spaces   = trimmed_str.empty() ? str.length() : str.find(trimmed_str);  // leading whitespace only; a length difference would also count any trailing whitespace trim() removed
+
+                if (starts_with(trimmed_str, "<embed:")) {
+                    size_t tag_end = trimmed_str.find(">");
+                    if (tag_end == std::string::npos) {
+                        return false;  // Incomplete tag.
+                    }
+                    std::string lower_tag = trimmed_str.substr(0, tag_end + 1);
+                    token_str             = lower_tag;  // Fallback to lowercased version
+
+                    if (curr_text.length() >= lower_tag.length()) {
+                        for (size_t i = search_pos; i <= curr_text.length() - lower_tag.length(); ++i) {
+                            bool match = true;
+                            for (size_t j = 0; j < lower_tag.length(); ++j) {
+                                if (static_cast<char>(std::tolower(static_cast<unsigned char>(curr_text[i + j]))) != lower_tag[j]) {  // unsigned char: tolower() on a negative char is undefined
+                                    match = false;
+                                    break;
+                                }
+                            }
+                            if (match) {
+                                token_str  = curr_text.substr(i, lower_tag.length());
+                                search_pos = i + token_str.length();
+                                break;
+                            }
+                        }
+                    }
+                    consumed_len = leading_spaces + token_str.length();
+                    is_embed_tag = true;
+                } else {
+                    // Not a tag. Could be a plain trigger word.
+                    size_t first_delim = trimmed_str.find_first_of(" ,");
+                    token_str          = (first_delim == std::string::npos) ? trimmed_str : trimmed_str.substr(0, first_delim);
+                    consumed_len       = leading_spaces + token_str.length();
+                }
+
+                std::string embd_name = trim(token_str);
+                if (is_embed_tag) {
+                    embd_name = embd_name.substr(strlen("<embed:"), embd_name.length() - strlen("<embed:") - 1);
+                }
+
+                std::string embd_path;
+                bool is_path = contains(embd_name, "/") || contains(embd_name, "\\");
+
+                if (is_path) {
+                    if (file_exists(embd_name)) {
+                        embd_path = embd_name;
+                    } else if (file_exists(embd_name + ".safetensors")) {
+                        embd_path = embd_name + ".safetensors";
+                    } else if (file_exists(embd_name + ".pt")) {
+                        embd_path = embd_name + ".pt";
+                    } else if (file_exists(embd_name + ".ckpt")) {
+                        embd_path = embd_name + ".ckpt";
+                    }
+                } else {
+                    embd_path = get_full_path(embd_dir, embd_name + ".pt");
+                    if (embd_path.size() == 0) {
+                        embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+                    }
+                    if (embd_path.size() == 0) {
+                        embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+                    }
+                }
+
+                if (embd_path.size() > 0) {
+                    if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+                        str = str.substr(consumed_len);
+                        return true;
+                    }
+                }
+
+                if (is_embed_tag) {
+                    LOG_WARN("could not load embedding '%s'", embd_name.c_str());
+                    str = str.substr(consumed_len);
+                    return true;  // Consume the failed tag so the tokenizer doesn't try to parse it as text.
+                }
+
+                // It was not a tag and we couldn't find a file for it as a trigger word.
+                return false;
+            };
             std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
             tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
             weights.insert(weights.end(), curr_tokens.size(), curr_weight);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index db4e07cb0..0c0e4c828 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -145,6 +145,7 @@ class StableDiffusionGGML {
 #endif
 #ifdef SD_USE_METAL
         LOG_DEBUG("Using Metal backend");
+        ggml_log_set(ggml_log_callback_default, nullptr);
         backend = ggml_backend_metal_init();
 #endif
 #ifdef SD_USE_VULKAN
@@ -191,8 +192,6 @@ class StableDiffusionGGML {
             rng = std::make_shared<PhiloxRNG>();
         }
 
-        ggml_log_set(ggml_log_callback_default, nullptr);
-
         init_backend();
 
         ModelLoader model_loader;
@@ -331,7 +330,7 @@ class StableDiffusionGGML {
             if (sd_version_is_dit(version)) {
                 use_t5xxl = true;
             }
-            if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) {
+            if (!ggml_backend_is_cpu(backend) && use_t5xxl) {
                 LOG_WARN(
                     "!!!It appears that you are using the T5 model. Some backends may encounter issues with it."
                     "If you notice that the generated images are completely black,"
@@ -345,12 +344,14 @@ class StableDiffusionGGML {
                 LOG_INFO("Using flash attention in the diffusion model");
             }
             if (sd_version_is_sd3(version)) {
+                if (sd_ctx_params->diffusion_flash_attn) {
+                    LOG_WARN("flash attention in this diffusion model is currently unsupported!");
+                }
                 cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
                                                                      offload_params_to_cpu,
                                                                      model_loader.tensor_storages_types);
                 diffusion_model  = std::make_shared<MMDiTModel>(backend,
                                                                offload_params_to_cpu,
-                                                               sd_ctx_params->diffusion_flash_attn,
                                                                model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
@@ -361,15 +362,6 @@ class StableDiffusionGGML {
                     }
                 }
                 if (is_chroma) {
-                    if (sd_ctx_params->diffusion_flash_attn && sd_ctx_params->chroma_use_dit_mask) {
-                        LOG_WARN(
-                            "!!!It looks like you are using Chroma with flash attention. "
-                            "This is currently unsupported. "
-                            "If you find that the generated images are broken, "
-                            "try either disabling flash attention or specifying "
-                            "--chroma-disable-dit-mask as a workaround.");
-                    }
-
                     cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                         offload_params_to_cpu,
                                                                         model_loader.tensor_storages_types,
@@ -581,7 +573,7 @@ class StableDiffusionGGML {
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);
@@ -752,10 +744,6 @@ class StableDiffusionGGML {
                 denoiser->scheduler          = std::make_shared<GITSSchedule>();
                 denoiser->scheduler->version = version;
                 break;
-            case SMOOTHSTEP:
-                LOG_INFO("Running with SmoothStep scheduler");
-                denoiser->scheduler = std::make_shared<SmoothStepSchedule>();
-                break;
             case DEFAULT:
                 // Don't touch anything.
                 break;
@@ -810,17 +798,34 @@ class StableDiffusionGGML {
             is_high_noise = true;
             LOG_DEBUG("high noise lora: %s", lora_name.c_str());
         }
-        std::string st_file_path   = path_join(lora_model_dir, lora_name + ".safetensors");
-        std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
+        std::string st_file_path;
+        std::string ckpt_file_path;
         std::string file_path;
-        if (file_exists(st_file_path)) {
+        bool is_path = contains(lora_name, "/") || contains(lora_name, "\\");
+
+        if (is_path) {
+            st_file_path   = lora_name + ".safetensors";
+            ckpt_file_path = lora_name + ".ckpt";
+        } else {
+            st_file_path   = path_join(lora_model_dir, lora_name + ".safetensors");
+            ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
+        }
+
+        if (is_path && file_exists(lora_name)) {
+            file_path = lora_name;
+        } else if (file_exists(st_file_path)) {
             file_path = st_file_path;
         } else if (file_exists(ckpt_file_path)) {
             file_path = ckpt_file_path;
         } else {
-            LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
+            if (is_path) {
+                LOG_WARN("can not find lora file %s, %s or %s", lora_name.c_str(), st_file_path.c_str(), ckpt_file_path.c_str());
+            } else {
+                LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
+            }
             return;
         }
+
         LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "");
         if (!lora.load_from_file()) {
             LOG_WARN("load lora tensors from %s failed", file_path.c_str());
@@ -1539,7 +1544,6 @@ const char* schedule_to_str[] = {
     "exponential",
     "ays",
     "gits",
-    "smoothstep",
 };
 
 const char* sd_schedule_name(enum scheduler_t scheduler) {
@@ -1559,7 +1563,7 @@ enum scheduler_t str_to_schedule(const char* str) {
 }
 
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
-    *sd_ctx_params                         = {};
+    memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t));
     sd_ctx_params->vae_decode_only         = true;
     sd_ctx_params->vae_tiling              = false;
     sd_ctx_params->free_params_immediately = true;
@@ -1643,7 +1647,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
 }
 
 void sd_sample_params_init(sd_sample_params_t* sample_params) {
-    *sample_params                             = {};
     sample_params->guidance.txt_cfg            = 7.0f;
     sample_params->guidance.img_cfg            = INFINITY;
     sample_params->guidance.distilled_guidance = 3.5f;
@@ -1690,9 +1693,9 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
 }
 
 void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
-    *sd_img_gen_params = {};
+    memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t));
+    sd_img_gen_params->clip_skip = -1;
     sd_sample_params_init(&sd_img_gen_params->sample_params);
-    sd_img_gen_params->clip_skip        = -1;
     sd_img_gen_params->ref_images_count = 0;
     sd_img_gen_params->width            = 512;
     sd_img_gen_params->height           = 512;
@@ -1749,7 +1752,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
 }
 
 void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
-    *sd_vid_gen_params = {};
+    memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
     sd_sample_params_init(&sd_vid_gen_params->sample_params);
     sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
     sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
@@ -1773,7 +1776,6 @@ sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
 
     sd_ctx->sd = new StableDiffusionGGML();
     if (sd_ctx->sd == NULL) {
-        free(sd_ctx);
         return NULL;
     }
 
@@ -2376,7 +2378,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                                         sd_img_gen_params->control_strength,
                                                         sd_img_gen_params->style_strength,
                                                         sd_img_gen_params->normalize_input,
-                                                        SAFE_STR(sd_img_gen_params->input_id_images_path),
+                                                        sd_img_gen_params->input_id_images_path,
                                                         ref_latents,
                                                         sd_img_gen_params->increase_ref_index,
                                                         concat_latent,

From ecf193aee3bc383fa23011aa7b293cac967799c7 Mon Sep 17 00:00:00 2001
From: rmatif <kingrealriadh@gmail.com>
Date: Sat, 13 Sep 2025 10:05:16 +0000
Subject: [PATCH 2/2] rebase

---
 stable-diffusion.cpp | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 0c0e4c828..4af3d6ab3 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -145,7 +145,6 @@ class StableDiffusionGGML {
 #endif
 #ifdef SD_USE_METAL
         LOG_DEBUG("Using Metal backend");
-        ggml_log_set(ggml_log_callback_default, nullptr);
         backend = ggml_backend_metal_init();
 #endif
 #ifdef SD_USE_VULKAN
@@ -192,6 +191,8 @@ class StableDiffusionGGML {
             rng = std::make_shared<PhiloxRNG>();
         }
 
+        ggml_log_set(ggml_log_callback_default, nullptr);
+
         init_backend();
 
         ModelLoader model_loader;
@@ -330,7 +331,7 @@ class StableDiffusionGGML {
             if (sd_version_is_dit(version)) {
                 use_t5xxl = true;
             }
-            if (!ggml_backend_is_cpu(backend) && use_t5xxl) {
+            if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) {
                 LOG_WARN(
                     "!!!It appears that you are using the T5 model. Some backends may encounter issues with it."
                     "If you notice that the generated images are completely black,"
@@ -344,14 +345,12 @@ class StableDiffusionGGML {
                 LOG_INFO("Using flash attention in the diffusion model");
             }
             if (sd_version_is_sd3(version)) {
-                if (sd_ctx_params->diffusion_flash_attn) {
-                    LOG_WARN("flash attention in this diffusion model is currently unsupported!");
-                }
                 cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
                                                                      offload_params_to_cpu,
                                                                      model_loader.tensor_storages_types);
                 diffusion_model  = std::make_shared<MMDiTModel>(backend,
                                                                offload_params_to_cpu,
+                                                               sd_ctx_params->diffusion_flash_attn,
                                                                model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
@@ -362,6 +361,15 @@ class StableDiffusionGGML {
                     }
                 }
                 if (is_chroma) {
+                    if (sd_ctx_params->diffusion_flash_attn && sd_ctx_params->chroma_use_dit_mask) {
+                        LOG_WARN(
+                            "!!!It looks like you are using Chroma with flash attention. "
+                            "This is currently unsupported. "
+                            "If you find that the generated images are broken, "
+                            "try either disabling flash attention or specifying "
+                            "--chroma-disable-dit-mask as a workaround.");
+                    }
+
                     cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                         offload_params_to_cpu,
                                                                         model_loader.tensor_storages_types,
@@ -573,7 +581,7 @@ class StableDiffusionGGML {
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
-        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);
@@ -744,6 +752,10 @@ class StableDiffusionGGML {
                 denoiser->scheduler          = std::make_shared<GITSSchedule>();
                 denoiser->scheduler->version = version;
                 break;
+            case SMOOTHSTEP:
+                LOG_INFO("Running with SmoothStep scheduler");
+                denoiser->scheduler = std::make_shared<SmoothStepSchedule>();
+                break;
             case DEFAULT:
                 // Don't touch anything.
                 break;
@@ -1544,6 +1556,7 @@ const char* schedule_to_str[] = {
     "exponential",
     "ays",
     "gits",
+    "smoothstep",
 };
 
 const char* sd_schedule_name(enum scheduler_t scheduler) {
@@ -1563,7 +1576,7 @@ enum scheduler_t str_to_schedule(const char* str) {
 }
 
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
-    memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t));
+    *sd_ctx_params                         = {};
     sd_ctx_params->vae_decode_only         = true;
     sd_ctx_params->vae_tiling              = false;
     sd_ctx_params->free_params_immediately = true;
@@ -1647,6 +1660,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
 }
 
 void sd_sample_params_init(sd_sample_params_t* sample_params) {
+    *sample_params                             = {};
     sample_params->guidance.txt_cfg            = 7.0f;
     sample_params->guidance.img_cfg            = INFINITY;
     sample_params->guidance.distilled_guidance = 3.5f;
@@ -1693,9 +1707,9 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
 }
 
 void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
-    memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t));
-    sd_img_gen_params->clip_skip = -1;
+    *sd_img_gen_params = {};
     sd_sample_params_init(&sd_img_gen_params->sample_params);
+    sd_img_gen_params->clip_skip        = -1;
     sd_img_gen_params->ref_images_count = 0;
     sd_img_gen_params->width            = 512;
     sd_img_gen_params->height           = 512;
@@ -1752,7 +1766,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
 }
 
 void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
-    memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
+    *sd_vid_gen_params = {};
     sd_sample_params_init(&sd_vid_gen_params->sample_params);
     sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
     sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
@@ -1776,6 +1790,7 @@ sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
 
     sd_ctx->sd = new StableDiffusionGGML();
     if (sd_ctx->sd == NULL) {
+        free(sd_ctx);
         return NULL;
     }
 
@@ -2378,7 +2393,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                                         sd_img_gen_params->control_strength,
                                                         sd_img_gen_params->style_strength,
                                                         sd_img_gen_params->normalize_input,
-                                                        sd_img_gen_params->input_id_images_path,
+                                                        SAFE_STR(sd_img_gen_params->input_id_images_path),
                                                         ref_latents,
                                                         sd_img_gen_params->increase_ref_index,
                                                         concat_latent,