From bc1551b40b3530e8672b9cbe32667270ef052164 Mon Sep 17 00:00:00 2001 From: rmatif Date: Sat, 13 Sep 2025 09:48:44 +0000 Subject: [PATCH 1/2] allow the use of absolute paths for lora and embeddings --- conditioner.hpp | 312 ++++++++++++++++++++++++++++++++++--------- stable-diffusion.cpp | 62 ++++----- 2 files changed, 283 insertions(+), 91 deletions(-) diff --git a/conditioner.hpp b/conditioner.hpp index cfd2b4ca7..0f8edac1d 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -196,27 +196,91 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } std::vector convert_token_to_id(std::string text) { + size_t search_pos = 0; auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { - size_t word_end = str.find(","); - std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); - embd_name = trim(embd_name); - std::string embd_path = get_full_path(embd_dir, embd_name + ".pt"); - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".ckpt"); + std::string token_str; + size_t consumed_len = 0; + bool is_embed_tag = false; + + // The tokenizer gives us chunks of text. We only process the first potential embedding token in that chunk. + std::string trimmed_str = trim(str); + size_t leading_spaces = str.length() - trimmed_str.length(); + + if (starts_with(trimmed_str, ""); + if (tag_end == std::string::npos) { + return false; // Incomplete tag. + } + std::string lower_tag = trimmed_str.substr(0, tag_end + 1); + token_str = lower_tag; // Fallback to lowercased version + + if (text.length() >= lower_tag.length()) { + for (size_t i = search_pos; i <= text.length() - lower_tag.length(); ++i) { + bool match = true; + for (size_t j = 0; j < lower_tag.length(); ++j) { + if (std::tolower(text[i + j]) != lower_tag[j]) { + match = false; + break; + } + } + if (match) { + token_str = text.substr(i, lower_tag.length()); + search_pos = i + token_str.length(); + break; + } + } + } + consumed_len = leading_spaces + token_str.length(); + is_embed_tag = true; + } else { + // Not a tag. Could be a plain trigger word. + size_t first_delim = trimmed_str.find_first_of(" ,"); + token_str = (first_delim == std::string::npos) ? trimmed_str : trimmed_str.substr(0, first_delim); + consumed_len = leading_spaces + token_str.length(); + } + + std::string embd_name = trim(token_str); + if (is_embed_tag) { + embd_name = embd_name.substr(strlen(" 0) { if (load_embedding(embd_name, embd_path, bpe_tokens)) { - if (word_end != std::string::npos) { - str = str.substr(word_end); - } else { - str = ""; - } + str = str.substr(consumed_len); return true; } } + + if (is_embed_tag) { + LOG_WARN("could not load embedding '%s'", embd_name.c_str()); + str = str.substr(consumed_len); + return true; // Consume the failed tag so the tokenizer doesn't try to parse it as text. + } + + // It was not a tag and we couldn't find a file for it as a trigger word. return false; }; std::vector curr_tokens = tokenizer.encode(text, on_new_token_cb); @@ -245,30 +309,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); } - auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { - size_t word_end = str.find(","); - std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); - embd_name = trim(embd_name); - std::string embd_path = get_full_path(embd_dir, embd_name + ".pt"); - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".ckpt"); - } - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".safetensors"); - } - if (embd_path.size() > 0) { - if (load_embedding(embd_name, embd_path, bpe_tokens)) { - if (word_end != std::string::npos) { - str = str.substr(word_end); - } else { - str = ""; - } - return true; - } - } - return false; - }; - std::vector tokens; std::vector weights; std::vector class_token_mask; @@ -278,6 +318,93 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector clean_input_ids; const std::string& curr_text = item.first; float curr_weight = item.second; + size_t search_pos = 0; + auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { + std::string token_str; + size_t consumed_len = 0; + bool is_embed_tag = false; + + // The tokenizer gives us chunks of text. We only process the first potential embedding token in that chunk. + std::string trimmed_str = trim(str); + size_t leading_spaces = str.length() - trimmed_str.length(); + + if (starts_with(trimmed_str, ""); + if (tag_end == std::string::npos) { + return false; // Incomplete tag. + } + std::string lower_tag = trimmed_str.substr(0, tag_end + 1); + token_str = lower_tag; // Fallback to lowercased version + + if (curr_text.length() >= lower_tag.length()) { + for (size_t i = search_pos; i <= curr_text.length() - lower_tag.length(); ++i) { + bool match = true; + for (size_t j = 0; j < lower_tag.length(); ++j) { + if (std::tolower(curr_text[i + j]) != lower_tag[j]) { + match = false; + break; + } + } + if (match) { + token_str = curr_text.substr(i, lower_tag.length()); + search_pos = i + token_str.length(); + break; + } + } + } + consumed_len = leading_spaces + token_str.length(); + is_embed_tag = true; + } else { + // Not a tag. Could be a plain trigger word. + size_t first_delim = trimmed_str.find_first_of(" ,"); + token_str = (first_delim == std::string::npos) ? trimmed_str : trimmed_str.substr(0, first_delim); + consumed_len = leading_spaces + token_str.length(); + } + + std::string embd_name = trim(token_str); + if (is_embed_tag) { + embd_name = embd_name.substr(strlen(" 0) { + if (load_embedding(embd_name, embd_path, bpe_tokens)) { + str = str.substr(consumed_len); + return true; + } + } + + if (is_embed_tag) { + LOG_WARN("could not load embedding '%s'", embd_name.c_str()); + str = str.substr(consumed_len); + return true; // Consume the failed tag so the tokenizer doesn't try to parse it as text. + } + + // It was not a tag and we couldn't find a file for it as a trigger word. + return false; + }; // printf(" %s: %f \n", curr_text.c_str(), curr_weight); std::vector curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); int32_t clean_index = 0; @@ -359,35 +486,98 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); } - auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { - size_t word_end = str.find(","); - std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); - embd_name = trim(embd_name); - std::string embd_path = get_full_path(embd_dir, embd_name + ".pt"); - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".ckpt"); - } - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".safetensors"); - } - if (embd_path.size() > 0) { - if (load_embedding(embd_name, embd_path, bpe_tokens)) { - if (word_end != std::string::npos) { - str = str.substr(word_end); - } else { - str = ""; - } - return true; - } - } - return false; - }; - std::vector tokens; std::vector weights; for (const auto& item : parsed_attention) { const std::string& curr_text = item.first; float curr_weight = item.second; + size_t search_pos = 0; + auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { + std::string token_str; + size_t consumed_len = 0; + bool is_embed_tag = false; + + // The tokenizer gives us chunks of text. We only process the first potential embedding token in that chunk. + std::string trimmed_str = trim(str); + size_t leading_spaces = str.length() - trimmed_str.length(); + + if (starts_with(trimmed_str, ""); + if (tag_end == std::string::npos) { + return false; // Incomplete tag. + } + std::string lower_tag = trimmed_str.substr(0, tag_end + 1); + token_str = lower_tag; // Fallback to lowercased version + + if (curr_text.length() >= lower_tag.length()) { + for (size_t i = search_pos; i <= curr_text.length() - lower_tag.length(); ++i) { + bool match = true; + for (size_t j = 0; j < lower_tag.length(); ++j) { + if (std::tolower(curr_text[i + j]) != lower_tag[j]) { + match = false; + break; + } + } + if (match) { + token_str = curr_text.substr(i, lower_tag.length()); + search_pos = i + token_str.length(); + break; + } + } + } + consumed_len = leading_spaces + token_str.length(); + is_embed_tag = true; + } else { + // Not a tag. Could be a plain trigger word. + size_t first_delim = trimmed_str.find_first_of(" ,"); + token_str = (first_delim == std::string::npos) ? trimmed_str : trimmed_str.substr(0, first_delim); + consumed_len = leading_spaces + token_str.length(); + } + + std::string embd_name = trim(token_str); + if (is_embed_tag) { + embd_name = embd_name.substr(strlen(" 0) { + if (load_embedding(embd_name, embd_path, bpe_tokens)) { + str = str.substr(consumed_len); + return true; + } + } + + if (is_embed_tag) { + LOG_WARN("could not load embedding '%s'", embd_name.c_str()); + str = str.substr(consumed_len); + return true; // Consume the failed tag so the tokenizer doesn't try to parse it as text. + } + + // It was not a tag and we couldn't find a file for it as a trigger word. + return false; + }; std::vector curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); weights.insert(weights.end(), curr_tokens.size(), curr_weight); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index db4e07cb0..0c0e4c828 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -145,6 +145,7 @@ class StableDiffusionGGML { #endif #ifdef SD_USE_METAL LOG_DEBUG("Using Metal backend"); + ggml_log_set(ggml_log_callback_default, nullptr); backend = ggml_backend_metal_init(); #endif #ifdef SD_USE_VULKAN @@ -191,8 +192,6 @@ class StableDiffusionGGML { rng = std::make_shared(); } - ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); ModelLoader model_loader; @@ -331,7 +330,7 @@ class StableDiffusionGGML { if (sd_version_is_dit(version)) { use_t5xxl = true; } - if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) { + if (!ggml_backend_is_cpu(backend) && use_t5xxl) { LOG_WARN( "!!!It appears that you are using the T5 model. Some backends may encounter issues with it." "If you notice that the generated images are completely black," @@ -345,12 +344,14 @@ class StableDiffusionGGML { LOG_INFO("Using flash attention in the diffusion model"); } if (sd_version_is_sd3(version)) { + if (sd_ctx_params->diffusion_flash_attn) { + LOG_WARN("flash attention in this diffusion model is currently unsupported!"); + } cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, model_loader.tensor_storages_types); diffusion_model = std::make_shared(backend, offload_params_to_cpu, - sd_ctx_params->diffusion_flash_attn, model_loader.tensor_storages_types); } else if (sd_version_is_flux(version)) { bool is_chroma = false; @@ -361,15 +362,6 @@ class StableDiffusionGGML { } } if (is_chroma) { - if (sd_ctx_params->diffusion_flash_attn && sd_ctx_params->chroma_use_dit_mask) { - LOG_WARN( - "!!!It looks like you are using Chroma with flash attention. " - "This is currently unsupported. " - "If you find that the generated images are broken, " - "try either disabling flash attention or specifying " - "--chroma-disable-dit-mask as a workaround."); - } - cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, model_loader.tensor_storages_types, @@ -581,7 +573,7 @@ class StableDiffusionGGML { if (version == VERSION_SVD) { ignore_tensors.insert("conditioner.embedders.3"); } - bool success = model_loader.load_tensors(tensors, ignore_tensors); + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -752,10 +744,6 @@ class StableDiffusionGGML { denoiser->scheduler = std::make_shared(); denoiser->scheduler->version = version; break; - case SMOOTHSTEP: - LOG_INFO("Running with SmoothStep scheduler"); - denoiser->scheduler = std::make_shared(); - break; case DEFAULT: // Don't touch anything. break; @@ -810,17 +798,34 @@ class StableDiffusionGGML { is_high_noise = true; LOG_DEBUG("high noise lora: %s", lora_name.c_str()); } - std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); - std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); + std::string st_file_path; + std::string ckpt_file_path; std::string file_path; - if (file_exists(st_file_path)) { + bool is_path = contains(lora_name, "/") || contains(lora_name, "\\"); + + if (is_path) { + st_file_path = lora_name + ".safetensors"; + ckpt_file_path = lora_name + ".ckpt"; + } else { + st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); + ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); + } + + if (is_path && file_exists(lora_name)) { + file_path = lora_name; + } else if (file_exists(st_file_path)) { file_path = st_file_path; } else if (file_exists(ckpt_file_path)) { file_path = ckpt_file_path; } else { - LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); + if (is_path) { + LOG_WARN("can not find lora file %s, %s or %s", lora_name.c_str(), st_file_path.c_str(), ckpt_file_path.c_str()); + } else { + LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); + } return; } + LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : ""); if (!lora.load_from_file()) { LOG_WARN("load lora tensors from %s failed", file_path.c_str()); @@ -1539,7 +1544,6 @@ const char* schedule_to_str[] = { "exponential", "ays", "gits", - "smoothstep", }; const char* sd_schedule_name(enum scheduler_t scheduler) { @@ -1559,7 +1563,7 @@ enum scheduler_t str_to_schedule(const char* str) { } void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { - *sd_ctx_params = {}; + memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t)); sd_ctx_params->vae_decode_only = true; sd_ctx_params->vae_tiling = false; sd_ctx_params->free_params_immediately = true; @@ -1643,7 +1647,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { } void sd_sample_params_init(sd_sample_params_t* sample_params) { - *sample_params = {}; sample_params->guidance.txt_cfg = 7.0f; sample_params->guidance.img_cfg = INFINITY; sample_params->guidance.distilled_guidance = 3.5f; @@ -1690,9 +1693,9 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { } void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { - *sd_img_gen_params = {}; + memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t)); + sd_img_gen_params->clip_skip = -1; sd_sample_params_init(&sd_img_gen_params->sample_params); - sd_img_gen_params->clip_skip = -1; sd_img_gen_params->ref_images_count = 0; sd_img_gen_params->width = 512; sd_img_gen_params->height = 512; @@ -1749,7 +1752,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { } void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { - *sd_vid_gen_params = {}; + memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t)); sd_sample_params_init(&sd_vid_gen_params->sample_params); sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params); sd_vid_gen_params->high_noise_sample_params.sample_steps = -1; @@ -1773,7 +1776,6 @@ sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) { sd_ctx->sd = new StableDiffusionGGML(); if (sd_ctx->sd == NULL) { - free(sd_ctx); return NULL; } @@ -2376,7 +2378,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_img_gen_params->control_strength, sd_img_gen_params->style_strength, sd_img_gen_params->normalize_input, - SAFE_STR(sd_img_gen_params->input_id_images_path), + sd_img_gen_params->input_id_images_path, ref_latents, sd_img_gen_params->increase_ref_index, concat_latent, From ecf193aee3bc383fa23011aa7b293cac967799c7 Mon Sep 17 00:00:00 2001 From: rmatif Date: Sat, 13 Sep 2025 10:05:16 +0000 Subject: [PATCH 2/2] rebase --- stable-diffusion.cpp | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 0c0e4c828..4af3d6ab3 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -145,7 +145,6 @@ class StableDiffusionGGML { #endif #ifdef SD_USE_METAL LOG_DEBUG("Using Metal backend"); - ggml_log_set(ggml_log_callback_default, nullptr); backend = ggml_backend_metal_init(); #endif #ifdef SD_USE_VULKAN @@ -192,6 +191,8 @@ class StableDiffusionGGML { rng = std::make_shared(); } + ggml_log_set(ggml_log_callback_default, nullptr); + init_backend(); ModelLoader model_loader; @@ -330,7 +331,7 @@ class StableDiffusionGGML { if (sd_version_is_dit(version)) { use_t5xxl = true; } - if (!ggml_backend_is_cpu(backend) && use_t5xxl) { + if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) { LOG_WARN( "!!!It appears that you are using the T5 model. Some backends may encounter issues with it." "If you notice that the generated images are completely black," @@ -344,14 +345,12 @@ class StableDiffusionGGML { LOG_INFO("Using flash attention in the diffusion model"); } if (sd_version_is_sd3(version)) { - if (sd_ctx_params->diffusion_flash_attn) { - LOG_WARN("flash attention in this diffusion model is currently unsupported!"); - } cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, model_loader.tensor_storages_types); diffusion_model = std::make_shared(backend, offload_params_to_cpu, + sd_ctx_params->diffusion_flash_attn, model_loader.tensor_storages_types); } else if (sd_version_is_flux(version)) { bool is_chroma = false; @@ -362,6 +361,15 @@ class StableDiffusionGGML { } } if (is_chroma) { + if (sd_ctx_params->diffusion_flash_attn && sd_ctx_params->chroma_use_dit_mask) { + LOG_WARN( + "!!!It looks like you are using Chroma with flash attention. " + "This is currently unsupported. " + "If you find that the generated images are broken, " + "try either disabling flash attention or specifying " + "--chroma-disable-dit-mask as a workaround."); + } + cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, model_loader.tensor_storages_types, @@ -573,7 +581,7 @@ class StableDiffusionGGML { if (version == VERSION_SVD) { ignore_tensors.insert("conditioner.embedders.3"); } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads); + bool success = model_loader.load_tensors(tensors, ignore_tensors); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -744,6 +752,10 @@ class StableDiffusionGGML { denoiser->scheduler = std::make_shared(); denoiser->scheduler->version = version; break; + case SMOOTHSTEP: + LOG_INFO("Running with SmoothStep scheduler"); + denoiser->scheduler = std::make_shared(); + break; case DEFAULT: // Don't touch anything. break; @@ -1544,6 +1556,7 @@ const char* schedule_to_str[] = { "exponential", "ays", "gits", + "smoothstep", }; const char* sd_schedule_name(enum scheduler_t scheduler) { @@ -1563,7 +1576,7 @@ enum scheduler_t str_to_schedule(const char* str) { } void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { - memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t)); + *sd_ctx_params = {}; sd_ctx_params->vae_decode_only = true; sd_ctx_params->vae_tiling = false; sd_ctx_params->free_params_immediately = true; @@ -1647,6 +1660,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { } void sd_sample_params_init(sd_sample_params_t* sample_params) { + *sample_params = {}; sample_params->guidance.txt_cfg = 7.0f; sample_params->guidance.img_cfg = INFINITY; sample_params->guidance.distilled_guidance = 3.5f; @@ -1693,9 +1707,9 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { } void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { - memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t)); - sd_img_gen_params->clip_skip = -1; + *sd_img_gen_params = {}; sd_sample_params_init(&sd_img_gen_params->sample_params); + sd_img_gen_params->clip_skip = -1; sd_img_gen_params->ref_images_count = 0; sd_img_gen_params->width = 512; sd_img_gen_params->height = 512; @@ -1752,7 +1766,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { } void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { - memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t)); + *sd_vid_gen_params = {}; sd_sample_params_init(&sd_vid_gen_params->sample_params); sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params); sd_vid_gen_params->high_noise_sample_params.sample_steps = -1; @@ -1776,6 +1790,7 @@ sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) { sd_ctx->sd = new StableDiffusionGGML(); if (sd_ctx->sd == NULL) { + free(sd_ctx); return NULL; } @@ -2378,7 +2393,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_img_gen_params->control_strength, sd_img_gen_params->style_strength, sd_img_gen_params->normalize_input, - sd_img_gen_params->input_id_images_path, + SAFE_STR(sd_img_gen_params->input_id_images_path), ref_latents, sd_img_gen_params->increase_ref_index, concat_latent,