Commit 6103d86

refactor: introduce GGMLRunnerContext (#928)

* introduce GGMLRunnerContext
* add Flash Attention enable control through GGMLRunnerContext
* add conv2d_direct enable control through GGMLRunnerContext
1 parent c42826b commit 6103d86

21 files changed: +1083 additions, -1203 deletions
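The diff below replaces the old calling convention, a raw `struct ggml_context*` plus a separate `ggml_backend_t`, with a single `GGMLRunnerContext*`. A minimal sketch of what that context plausibly bundles, inferred from the call sites in this diff: only the `ggml_ctx` member is confirmed by the hunks (via `ctx->ggml_ctx`); the backend member and the two feature flags are assumptions based on the removed `backend` parameters and the commit message.

```cpp
// Sketch only, inferred from this diff; not the actual definition in the repo.
struct GGMLRunnerContext {
    struct ggml_context* ggml_ctx = nullptr;  // graph-building context (confirmed: used as ctx->ggml_ctx below)
    ggml_backend_t backend        = nullptr;  // assumed: replaces the removed `ggml_backend_t backend` parameters
    bool flash_attn_enabled       = false;    // assumed name: "Flash Attention enable control"
    bool conv2d_direct_enabled    = false;    // assumed name: "conv2d_direct enable control"
};
```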

clip.hpp (44 additions & 47 deletions)
@@ -451,16 +451,16 @@ struct CLIPMLP : public GGMLBlock {
         }
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
         // x: [N, n_token, d_model]
         auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
         auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
 
         x = fc1->forward(ctx, x);
         if (use_gelu) {
-            x = ggml_gelu_inplace(ctx, x);
+            x = ggml_gelu_inplace(ctx->ggml_ctx, x);
         } else {
-            x = ggml_gelu_quick_inplace(ctx, x);
+            x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x);
         }
         x = fc2->forward(ctx, x);
         return x;
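This first hunk shows the mechanical pattern that repeats throughout the file: calls into other `GGMLBlock` forwards pass the `GGMLRunnerContext*` through unchanged, while raw `ggml_*` graph ops unwrap it. Condensed from the hunk above:

```cpp
x = fc1->forward(ctx, x);                 // block-to-block calls take the runner context
x = ggml_gelu_inplace(ctx->ggml_ctx, x);  // raw ggml ops take the wrapped ggml_context
```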
@@ -488,15 +488,15 @@ struct CLIPLayer : public GGMLBlock {
         blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, bool mask = true) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) {
         // x: [N, n_token, d_model]
         auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
         auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
         auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
         auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
 
-        x = ggml_add(ctx, x, self_attn->forward(ctx, backend, layer_norm1->forward(ctx, x), mask));
-        x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
+        x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
+        x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
         return x;
     }
 };
@@ -517,8 +517,7 @@ struct CLIPEncoder : public GGMLBlock {
         }
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* x,
                                 int clip_skip = -1,
                                 bool mask = true) {
@@ -536,7 +535,7 @@ struct CLIPEncoder : public GGMLBlock {
             }
             std::string name = "layers." + std::to_string(i);
             auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
-            x = layer->forward(ctx, backend, x, mask);  // [N, n_token, d_model]
+            x = layer->forward(ctx, x, mask);  // [N, n_token, d_model]
             // LOG_DEBUG("layer %d", i);
         }
         return x;
@@ -578,20 +577,20 @@ class CLIPEmbeddings : public GGMLBlock {
         return params["token_embedding.weight"];
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* custom_embed_weight) {
         // input_ids: [N, n_token]
         auto token_embed_weight = params["token_embedding.weight"];
         auto position_embed_weight = params["position_embedding.weight"];
 
         GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
-        input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
-        auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
-        token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
+        input_ids = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
+        auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
+        token_embedding = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
 
         // token_embedding + position_embedding
-        auto x = ggml_add(ctx,
+        auto x = ggml_add(ctx->ggml_ctx,
                           token_embedding,
                           position_embed_weight);  // [N, n_token, embed_dim]
         return x;
@@ -629,7 +628,7 @@ class CLIPVisionEmbeddings : public GGMLBlock {
         num_positions = num_patches + 1;
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
         // pixel_values: [N, num_channels, image_size, image_size]
         // return: [N, num_positions, embed_dim]
         GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@@ -641,18 +640,18 @@ class CLIPVisionEmbeddings : public GGMLBlock {
         // concat(patch_embedding, class_embedding) + position_embedding
         struct ggml_tensor* patch_embedding;
         int64_t N = pixel_values->ne[3];
-        patch_embedding = ggml_ext_conv_2d(ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
-        patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N);  // [N, embed_dim, num_patches]
-        patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3));  // [N, num_patches, embed_dim]
-        patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N);  // [N, num_patches, embed_dim, 1]
-
-        struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N);
-        class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding);  // [N, embed_dim]
-        class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]
-
-        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
-        x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);  // [N, num_positions, embed_dim]
-        x = ggml_add(ctx, x, position_embed_weight);
+        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
+        patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N);  // [N, embed_dim, num_patches]
+        patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3));  // [N, num_patches, embed_dim]
+        patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N);  // [N, num_patches, embed_dim, 1]
+
+        struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
+        class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding);  // [N, embed_dim]
+        class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]
+
+        struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
+        x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N);  // [N, num_positions, embed_dim]
+        x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
         return x;  // [N, num_positions, embed_dim]
     }
 };
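Per the commit message, conv2d_direct enablement is routed through `GGMLRunnerContext`, but the hunk above only threads the context into `ggml_ext_conv_2d`. A hypothetical sketch of how a helper that receives the runner context could dispatch on such a flag; the flag name, the helper itself, and the direct-conv signature are assumptions, not code from this commit:

```cpp
// Hypothetical sketch, not part of this diff: dispatch between the im2col-based
// helper used above and a direct convolution kernel, driven by the context flag.
static struct ggml_tensor* conv2d_via_runner_ctx(GGMLRunnerContext* ctx,
                                                 struct ggml_tensor* x,
                                                 struct ggml_tensor* w,
                                                 struct ggml_tensor* b,
                                                 int stride) {
    if (ctx->conv2d_direct_enabled) {  // assumed flag name
        // assumed entry point and signature; bias add omitted for brevity
        return ggml_conv_2d_direct(ctx->ggml_ctx, w, x, stride, stride, 0, 0, 1, 1);
    }
    return ggml_ext_conv_2d(ctx->ggml_ctx, x, w, b, stride, stride);  // as used in the hunk above
}
```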
@@ -714,8 +713,7 @@ class CLIPTextModel : public GGMLBlock {
         return embeddings->get_token_embed_weight();
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
@@ -727,16 +725,16 @@ class CLIPTextModel : public GGMLBlock {
         auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
 
         auto x = embeddings->forward(ctx, input_ids, tkn_embeddings);  // [N, n_token, hidden_size]
-        x = encoder->forward(ctx, backend, x, return_pooled ? -1 : clip_skip, true);
+        x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
         if (return_pooled || with_final_ln) {
             x = final_layer_norm->forward(ctx, x);
         }
 
         if (return_pooled) {
             auto text_projection = params["text_projection"];
-            ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
+            ggml_tensor* pooled = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
             if (text_projection != nullptr) {
-                pooled = ggml_ext_linear(ctx, pooled, text_projection, nullptr);
+                pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
             } else {
                 LOG_DEBUG("identity projection");
             }
@@ -779,8 +777,7 @@ class CLIPVisionModel : public GGMLBlock {
         blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* pixel_values,
                                 bool return_pooled = true,
                                 int clip_skip = -1) {
@@ -792,14 +789,14 @@ class CLIPVisionModel : public GGMLBlock {
 
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x = pre_layernorm->forward(ctx, x);
-        x = encoder->forward(ctx, backend, x, clip_skip, false);
+        x = encoder->forward(ctx, x, clip_skip, false);
         // print_ggml_tensor(x, true, "ClipVisionModel x: ");
         auto last_hidden_state = x;
         x = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
 
         GGML_ASSERT(x->ne[3] == 1);
         if (return_pooled) {
-            ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
+            ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
             return pooled;  // [N, hidden_size]
         } else {
             // return x;  // [N, n_token, hidden_size]
@@ -831,12 +828,12 @@ class CLIPProjection : public UnaryBlock {
           out_features(out_features),
           transpose_weight(transpose_weight) {}
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         struct ggml_tensor* w = params["weight"];
         if (transpose_weight) {
-            w = ggml_cont(ctx, ggml_transpose(ctx, w));
+            w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
         }
-        return ggml_ext_linear(ctx, x, w, nullptr);
+        return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
     }
 };
 
@@ -860,8 +857,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
         blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* pixel_values,
                                 bool return_pooled = true,
                                 int clip_skip = -1) {
@@ -870,7 +866,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
         auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
         auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
 
-        auto x = vision_model->forward(ctx, backend, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]
+        auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]
 
         if (return_pooled) {
             x = visual_projection->forward(ctx, x);  // [N, projection_dim]
@@ -902,8 +898,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
         model.get_param_tensors(tensors, prefix);
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
@@ -913,10 +908,10 @@ struct CLIPTextModelRunner : public GGMLRunner {
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
             GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
-            input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
+            input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
 
-        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
+        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
@@ -943,7 +938,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
+        auto runner_ctx = get_context();
+
+        struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
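The last hunk shows the caller side of the refactor: `build_graph` now asks the `GGMLRunner` base class for a context via `get_context()` and threads a pointer to it through the whole forward pass, instead of handing `compute_ctx` and `runtime_backend` to every call. The `get_context()` name is taken from the hunk above; what it populates beyond the graph context is assumed.

```cpp
// Caller-side pattern (from the hunk above); the context presumably carries
// compute_ctx, the runtime backend, and the feature flags set on the runner.
auto runner_ctx = get_context();

struct ggml_tensor* hidden_states =
    forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);

ggml_build_forward_expand(gf, hidden_states);
```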
949946
