Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 76 additions & 52 deletions stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ class StableDiffusionGGML {
bool is_using_v_parameterization = false;
bool is_using_edm_v_parameterization = false;

bool has_vision = false;

std::map<std::string, struct ggml_tensor*> tensors;

std::string lora_model_dir;
Expand Down Expand Up @@ -412,6 +414,7 @@ class StableDiffusionGGML {
if (!vae_decode_only) {
enable_vision = true;
}
has_vision = enable_vision;
cond_stage_model = std::make_shared<Qwen2_5_VLCLIPEmbedder>(clip_backend,
offload_params_to_cpu,
model_loader.tensor_storages_types,
Expand Down Expand Up @@ -1116,7 +1119,7 @@ class StableDiffusionGGML {
ggml_tensor* noise,
SDCondition cond,
SDCondition uncond,
SDCondition img_cond,
SDCondition img_uncond,
ggml_tensor* control_hint,
float control_strength,
sd_guidance_params_t guidance,
Expand All @@ -1141,9 +1144,10 @@ class StableDiffusionGGML {
float img_cfg_scale = std::isfinite(guidance.img_cfg) ? guidance.img_cfg : guidance.txt_cfg;
float slg_scale = guidance.slg.scale;

if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) {
if (img_cfg_scale != 1.0 && !sd_version_is_inpaint_or_unet_edit(version)
&& (version != VERSION_FLUX || ref_latents.size()==0) && (version != VERSION_QWEN_IMAGE || ref_latents.size()==0)) {
LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance...");
img_cfg_scale = cfg_scale;
img_cfg_scale = 1.0f;
}

size_t steps = sigmas.size() - 1;
Expand All @@ -1156,16 +1160,22 @@ class StableDiffusionGGML {

struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x);

bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != nullptr;
bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != nullptr;
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0;
bool has_conditionned = (has_skiplayer || cfg_scale != 0.0) && cond.c_crossattn != nullptr;
bool has_unconditioned = cfg_scale != img_cfg_scale && uncond.c_crossattn != nullptr;
bool has_img_uncond = img_cfg_scale != 1.0 && img_uncond.c_crossattn != nullptr;

GGML_ASSERT(has_conditionned || has_unconditioned || has_img_uncond);

// denoise wrapper
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* out_cond = x;
struct ggml_tensor* out_uncond = nullptr;
struct ggml_tensor* out_skip = nullptr;
struct ggml_tensor* out_img_cond = nullptr;

if (has_conditionned) {
out_cond = ggml_dup_tensor(work_ctx, x);
}
if (has_unconditioned) {
out_uncond = ggml_dup_tensor(work_ctx, x);
}
Expand All @@ -1177,7 +1187,7 @@ class StableDiffusionGGML {
LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
}
}
if (has_img_cond) {
if (has_img_uncond) {
out_img_cond = ggml_dup_tensor(work_ctx, x);
}
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
Expand Down Expand Up @@ -1240,21 +1250,23 @@ class StableDiffusionGGML {
diffusion_params.vace_context = vace_context;
diffusion_params.vace_strength = vace_strength;

if (start_merge_step == -1 || step <= start_merge_step) {
// cond
diffusion_params.context = cond.c_crossattn;
diffusion_params.c_concat = cond.c_concat;
diffusion_params.y = cond.c_vector;
work_diffusion_model->compute(n_threads,
diffusion_params,
&out_cond);
} else {
diffusion_params.context = id_cond.c_crossattn;
diffusion_params.c_concat = cond.c_concat;
diffusion_params.y = id_cond.c_vector;
work_diffusion_model->compute(n_threads,
diffusion_params,
&out_cond);
if (has_conditionned) {
if (start_merge_step == -1 || step <= start_merge_step) {
// cond
diffusion_params.context = cond.c_crossattn;
diffusion_params.c_concat = cond.c_concat;
diffusion_params.y = cond.c_vector;
work_diffusion_model->compute(n_threads,
diffusion_params,
&out_cond);
} else {
diffusion_params.context = id_cond.c_crossattn;
diffusion_params.c_concat = cond.c_concat;
diffusion_params.y = id_cond.c_vector;
work_diffusion_model->compute(n_threads,
diffusion_params,
&out_cond);
}
}

float* negative_data = nullptr;
Expand All @@ -1274,15 +1286,16 @@ class StableDiffusionGGML {
negative_data = (float*)out_uncond->data;
}

float* img_cond_data = nullptr;
if (has_img_cond) {
diffusion_params.context = img_cond.c_crossattn;
diffusion_params.c_concat = img_cond.c_concat;
diffusion_params.y = img_cond.c_vector;
float* img_uncond_data = nullptr;
if (has_img_uncond) {
diffusion_params.ref_latents = {};
diffusion_params.context = img_uncond.c_crossattn;
diffusion_params.c_concat = img_uncond.c_concat;
diffusion_params.y = img_uncond.c_vector;
work_diffusion_model->compute(n_threads,
diffusion_params,
&out_img_cond);
img_cond_data = (float*)out_img_cond->data;
img_uncond_data = (float*)out_img_cond->data;
}

int step_count = sigmas.size();
Expand Down Expand Up @@ -1321,19 +1334,19 @@ class StableDiffusionGGML {
float latent_result = positive_data[i];
if (has_unconditioned) {
// out_uncond + cfg_scale * (out_cond - out_uncond)
if (has_img_cond) {
// out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
if (has_img_uncond) {
// out_txtimg_uncond + text_cfg_scale * (out_cond - out_txt_uncond) + image_cfg_scale * (out_txt_uncond - out_txtimg_uncond)
latent_result = img_uncond_data[i] + img_cfg_scale * (negative_data[i] - img_uncond_data[i]) + cfg_scale * (positive_data[i] - negative_data[i]);
} else {
// img_cfg_scale == cfg_scale
// img_cfg_scale == 1
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
}
} else if (has_img_cond) {
// img_cfg_scale == 1
latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]);
} else if (has_img_uncond) {
// img_cfg_scale == cfg_scale
latent_result = img_uncond_data[i] + cfg_scale * (positive_data[i] - img_uncond_data[i]);
}
if (is_skiplayer_step) {
latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
latent_result = latent_result + slg_scale * (positive_data[i] - skip_layer_data[i]);
}
// v = latent_result, eps = latent_result
// denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
Expand Down Expand Up @@ -1977,7 +1990,7 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
sample_params->guidance.txt_cfg,
std::isfinite(sample_params->guidance.img_cfg)
? sample_params->guidance.img_cfg
: sample_params->guidance.txt_cfg,
: 1.0f,
sample_params->guidance.distilled_guidance,
sample_params->guidance.slg.layer_count,
sample_params->guidance.slg.layer_start,
Expand Down Expand Up @@ -2142,7 +2155,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
}

if (!std::isfinite(guidance.img_cfg)) {
guidance.img_cfg = guidance.txt_cfg;
// default to 1
guidance.img_cfg = 1.0f;
}

// for (auto v : sigmas) {
Expand Down Expand Up @@ -2250,7 +2264,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,

SDCondition uncond;
if (guidance.txt_cfg != 1.0 ||
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != 1.0f)) {
bool zero_out_masked = false;
if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) {
zero_out_masked = true;
Expand All @@ -2261,6 +2275,15 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
sd_ctx->sd->n_threads,
condition_params);
}
SDCondition img_uncond = uncond;
if (uncond.c_crossattn != nullptr && guidance.img_cfg != 1.0 && sd_ctx->sd->has_vision && condition_params.ref_images.size() > 0) {
// Recompute the negative conditioning without ref images
condition_params.ref_images = {};
img_uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
sd_ctx->sd->n_threads,
condition_params);
}

int64_t t1 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);

Expand Down Expand Up @@ -2288,14 +2311,16 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
ggml_ext_tensor_scale_inplace(control_latent, control_strength);
}

struct ggml_tensor* empty_latent;

if (sd_version_is_inpaint(sd_ctx->sd->version)) {
int64_t mask_channels = 1;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
mask_channels = 8 * 8; // flatten the whole mask
} else if (sd_ctx->sd->version == VERSION_FLEX_2) {
mask_channels = 1 + init_latent->ne[2];
}
auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
// no mask, set the whole image as masked
for (int64_t x = 0; x < empty_latent->ne[0]; x++) {
for (int64_t y = 0; y < empty_latent->ne[1]; y++) {
Expand Down Expand Up @@ -2345,31 +2370,30 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
concat_latent = empty_latent;
}
cond.c_concat = concat_latent;
uncond.c_concat = empty_latent;
uncond.c_concat = concat_latent;
denoise_mask = nullptr;
} else if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
empty_latent = ggml_dup_tensor(work_ctx, init_latent);
ggml_set_f32(empty_latent, 0);
uncond.c_concat = empty_latent;
cond.c_concat = ref_latents[0];
cond.c_concat = ref_latents[0];
if (cond.c_concat == nullptr) {
cond.c_concat = empty_latent;
}
uncond.c_concat = cond.c_concat;
} else if (sd_version_is_control(sd_ctx->sd->version)) {
auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
empty_latent = ggml_dup_tensor(work_ctx, init_latent);
ggml_set_f32(empty_latent, 0);
uncond.c_concat = empty_latent;
if (sd_ctx->sd->control_net == nullptr) {
cond.c_concat = control_latent;
}
if (cond.c_concat == nullptr) {
cond.c_concat = empty_latent;
}
uncond.c_concat = cond.c_concat;
}
SDCondition img_cond;
if (uncond.c_crossattn != nullptr &&
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat);
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.img_cfg != 1.0)) {
img_uncond.c_concat = empty_latent;
}
for (int b = 0; b < batch_count; b++) {
int64_t sampling_start = ggml_time_ms();
Expand All @@ -2396,7 +2420,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
noise,
cond,
uncond,
img_cond,
img_uncond,
image_hint,
control_strength,
guidance,
Expand Down Expand Up @@ -3027,7 +3051,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
noise,
cond,
uncond,
{},
uncond,
nullptr,
0,
sd_vid_gen_params->high_noise_sample_params.guidance,
Expand Down Expand Up @@ -3063,7 +3087,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
noise,
cond,
uncond,
{},
uncond,
nullptr,
0,
sd_vid_gen_params->sample_params.guidance,
Expand Down
Loading