@@ -451,16 +451,16 @@ struct CLIPMLP : public GGMLBlock {
451451 }
452452 }
453453
454- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * x) {
454+ struct ggml_tensor * forward (GGMLRunnerContext * ctx, struct ggml_tensor * x) {
455455 // x: [N, n_token, d_model]
456456 auto fc1 = std::dynamic_pointer_cast<Linear>(blocks[" fc1" ]);
457457 auto fc2 = std::dynamic_pointer_cast<Linear>(blocks[" fc2" ]);
458458
459459 x = fc1->forward (ctx, x);
460460 if (use_gelu) {
461- x = ggml_gelu_inplace (ctx, x);
461+ x = ggml_gelu_inplace (ctx-> ggml_ctx , x);
462462 } else {
463- x = ggml_gelu_quick_inplace (ctx, x);
463+ x = ggml_gelu_quick_inplace (ctx-> ggml_ctx , x);
464464 }
465465 x = fc2->forward (ctx, x);
466466 return x;
@@ -488,15 +488,15 @@ struct CLIPLayer : public GGMLBlock {
488488 blocks[" mlp" ] = std::shared_ptr<GGMLBlock>(new CLIPMLP (d_model, intermediate_size));
489489 }
490490
491- struct ggml_tensor * forward (struct ggml_context * ctx, ggml_backend_t backend , struct ggml_tensor * x, bool mask = true ) {
491+ struct ggml_tensor * forward (GGMLRunnerContext * ctx, struct ggml_tensor * x, bool mask = true ) {
492492 // x: [N, n_token, d_model]
493493 auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks[" self_attn" ]);
494494 auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks[" layer_norm1" ]);
495495 auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks[" layer_norm2" ]);
496496 auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks[" mlp" ]);
497497
498- x = ggml_add (ctx, x, self_attn->forward (ctx, backend , layer_norm1->forward (ctx, x), mask));
499- x = ggml_add (ctx, x, mlp->forward (ctx, layer_norm2->forward (ctx, x)));
498+ x = ggml_add (ctx-> ggml_ctx , x, self_attn->forward (ctx, layer_norm1->forward (ctx, x), mask));
499+ x = ggml_add (ctx-> ggml_ctx , x, mlp->forward (ctx, layer_norm2->forward (ctx, x)));
500500 return x;
501501 }
502502};
@@ -517,8 +517,7 @@ struct CLIPEncoder : public GGMLBlock {
517517 }
518518 }
519519
520- struct ggml_tensor * forward (struct ggml_context * ctx,
521- ggml_backend_t backend,
520+ struct ggml_tensor * forward (GGMLRunnerContext* ctx,
522521 struct ggml_tensor * x,
523522 int clip_skip = -1 ,
524523 bool mask = true ) {
@@ -536,7 +535,7 @@ struct CLIPEncoder : public GGMLBlock {
536535 }
537536 std::string name = " layers." + std::to_string (i);
538537 auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
539- x = layer->forward (ctx, backend, x, mask); // [N, n_token, d_model]
538+ x = layer->forward (ctx, x, mask); // [N, n_token, d_model]
540539 // LOG_DEBUG("layer %d", i);
541540 }
542541 return x;
@@ -578,20 +577,20 @@ class CLIPEmbeddings : public GGMLBlock {
578577 return params[" token_embedding.weight" ];
579578 }
580579
581- struct ggml_tensor * forward (struct ggml_context * ctx,
580+ struct ggml_tensor * forward (GGMLRunnerContext * ctx,
582581 struct ggml_tensor * input_ids,
583582 struct ggml_tensor * custom_embed_weight) {
584583 // input_ids: [N, n_token]
585584 auto token_embed_weight = params[" token_embedding.weight" ];
586585 auto position_embed_weight = params[" position_embedding.weight" ];
587586
588587 GGML_ASSERT (input_ids->ne [0 ] == position_embed_weight->ne [1 ]);
589- input_ids = ggml_reshape_3d (ctx, input_ids, input_ids->ne [0 ], 1 , input_ids->ne [1 ]);
590- auto token_embedding = ggml_get_rows (ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
591- token_embedding = ggml_reshape_3d (ctx, token_embedding, token_embedding->ne [0 ], token_embedding->ne [1 ], token_embedding->ne [3 ]);
588+ input_ids = ggml_reshape_3d (ctx-> ggml_ctx , input_ids, input_ids->ne [0 ], 1 , input_ids->ne [1 ]);
589+ auto token_embedding = ggml_get_rows (ctx-> ggml_ctx , custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
590+ token_embedding = ggml_reshape_3d (ctx-> ggml_ctx , token_embedding, token_embedding->ne [0 ], token_embedding->ne [1 ], token_embedding->ne [3 ]);
592591
593592 // token_embedding + position_embedding
594- auto x = ggml_add (ctx,
593+ auto x = ggml_add (ctx-> ggml_ctx ,
595594 token_embedding,
596595 position_embed_weight); // [N, n_token, embed_dim]
597596 return x;
@@ -629,7 +628,7 @@ class CLIPVisionEmbeddings : public GGMLBlock {
629628 num_positions = num_patches + 1 ;
630629 }
631630
632- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * pixel_values) {
631+ struct ggml_tensor * forward (GGMLRunnerContext * ctx, struct ggml_tensor * pixel_values) {
633632 // pixel_values: [N, num_channels, image_size, image_size]
634633 // return: [N, num_positions, embed_dim]
635634 GGML_ASSERT (pixel_values->ne [0 ] == image_size && pixel_values->ne [1 ] == image_size && pixel_values->ne [2 ] == num_channels);
@@ -641,18 +640,18 @@ class CLIPVisionEmbeddings : public GGMLBlock {
641640 // concat(patch_embedding, class_embedding) + position_embedding
642641 struct ggml_tensor * patch_embedding;
643642 int64_t N = pixel_values->ne [3 ];
644- patch_embedding = ggml_ext_conv_2d (ctx, pixel_values, patch_embed_weight, nullptr , patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
645- patch_embedding = ggml_reshape_3d (ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
646- patch_embedding = ggml_cont (ctx, ggml_permute (ctx, patch_embedding, 1 , 0 , 2 , 3 )); // [N, num_patches, embed_dim]
647- patch_embedding = ggml_reshape_4d (ctx, patch_embedding, 1 , embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
648-
649- struct ggml_tensor * class_embedding = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, embed_dim, N);
650- class_embedding = ggml_repeat (ctx, class_embed_weight, class_embedding); // [N, embed_dim]
651- class_embedding = ggml_reshape_4d (ctx, class_embedding, 1 , embed_dim, 1 , N); // [N, 1, embed_dim, 1]
652-
653- struct ggml_tensor * x = ggml_concat (ctx, class_embedding, patch_embedding, 2 ); // [N, num_positions, embed_dim, 1]
654- x = ggml_reshape_3d (ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
655- x = ggml_add (ctx, x, position_embed_weight);
643+ patch_embedding = ggml_ext_conv_2d (ctx-> ggml_ctx , pixel_values, patch_embed_weight, nullptr , patch_size, patch_size); // [N, embed_dim, image_size // patch_size, image_size // patch_size]
644+ patch_embedding = ggml_reshape_3d (ctx-> ggml_ctx , patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
645+ patch_embedding = ggml_cont (ctx-> ggml_ctx , ggml_permute (ctx-> ggml_ctx , patch_embedding, 1 , 0 , 2 , 3 )); // [N, num_patches, embed_dim]
646+ patch_embedding = ggml_reshape_4d (ctx-> ggml_ctx , patch_embedding, 1 , embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
647+
648+ struct ggml_tensor * class_embedding = ggml_new_tensor_2d (ctx-> ggml_ctx , GGML_TYPE_F32, embed_dim, N);
649+ class_embedding = ggml_repeat (ctx-> ggml_ctx , class_embed_weight, class_embedding); // [N, embed_dim]
650+ class_embedding = ggml_reshape_4d (ctx-> ggml_ctx , class_embedding, 1 , embed_dim, 1 , N); // [N, 1, embed_dim, 1]
651+
652+ struct ggml_tensor * x = ggml_concat (ctx-> ggml_ctx , class_embedding, patch_embedding, 2 ); // [N, num_positions, embed_dim, 1]
653+ x = ggml_reshape_3d (ctx-> ggml_ctx , x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
654+ x = ggml_add (ctx-> ggml_ctx , x, position_embed_weight);
656655 return x; // [N, num_positions, embed_dim]
657656 }
658657};
@@ -714,8 +713,7 @@ class CLIPTextModel : public GGMLBlock {
714713 return embeddings->get_token_embed_weight ();
715714 }
716715
717- struct ggml_tensor * forward (struct ggml_context * ctx,
718- ggml_backend_t backend,
716+ struct ggml_tensor * forward (GGMLRunnerContext* ctx,
719717 struct ggml_tensor * input_ids,
720718 struct ggml_tensor * tkn_embeddings,
721719 size_t max_token_idx = 0 ,
@@ -727,16 +725,16 @@ class CLIPTextModel : public GGMLBlock {
727725 auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks[" final_layer_norm" ]);
728726
729727 auto x = embeddings->forward (ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
730- x = encoder->forward (ctx, backend, x, return_pooled ? -1 : clip_skip, true );
728+ x = encoder->forward (ctx, x, return_pooled ? -1 : clip_skip, true );
731729 if (return_pooled || with_final_ln) {
732730 x = final_layer_norm->forward (ctx, x);
733731 }
734732
735733 if (return_pooled) {
736734 auto text_projection = params[" text_projection" ];
737- ggml_tensor* pooled = ggml_view_1d (ctx, x, hidden_size, x->nb [1 ] * max_token_idx);
735+ ggml_tensor* pooled = ggml_view_1d (ctx-> ggml_ctx , x, hidden_size, x->nb [1 ] * max_token_idx);
738736 if (text_projection != nullptr ) {
739- pooled = ggml_ext_linear (ctx, pooled, text_projection, nullptr );
737+ pooled = ggml_ext_linear (ctx-> ggml_ctx , pooled, text_projection, nullptr );
740738 } else {
741739 LOG_DEBUG (" identity projection" );
742740 }
@@ -779,8 +777,7 @@ class CLIPVisionModel : public GGMLBlock {
779777 blocks[" post_layernorm" ] = std::shared_ptr<GGMLBlock>(new LayerNorm (hidden_size));
780778 }
781779
782- struct ggml_tensor * forward (struct ggml_context * ctx,
783- ggml_backend_t backend,
780+ struct ggml_tensor * forward (GGMLRunnerContext* ctx,
784781 struct ggml_tensor * pixel_values,
785782 bool return_pooled = true ,
786783 int clip_skip = -1 ) {
@@ -792,14 +789,14 @@ class CLIPVisionModel : public GGMLBlock {
792789
793790 auto x = embeddings->forward (ctx, pixel_values); // [N, num_positions, embed_dim]
794791 x = pre_layernorm->forward (ctx, x);
795- x = encoder->forward (ctx, backend, x, clip_skip, false );
792+ x = encoder->forward (ctx, x, clip_skip, false );
796793 // print_ggml_tensor(x, true, "ClipVisionModel x: ");
797794 auto last_hidden_state = x;
798795 x = post_layernorm->forward (ctx, x); // [N, n_token, hidden_size]
799796
800797 GGML_ASSERT (x->ne [3 ] == 1 );
801798 if (return_pooled) {
802- ggml_tensor* pooled = ggml_cont (ctx, ggml_view_2d (ctx, x, x->ne [0 ], x->ne [2 ], x->nb [2 ], 0 ));
799+ ggml_tensor* pooled = ggml_cont (ctx-> ggml_ctx , ggml_view_2d (ctx-> ggml_ctx , x, x->ne [0 ], x->ne [2 ], x->nb [2 ], 0 ));
803800 return pooled; // [N, hidden_size]
804801 } else {
805802 // return x; // [N, n_token, hidden_size]
@@ -831,12 +828,12 @@ class CLIPProjection : public UnaryBlock {
831828 out_features (out_features),
832829 transpose_weight(transpose_weight) {}
833830
834- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * x) override {
831+ struct ggml_tensor * forward (GGMLRunnerContext * ctx, struct ggml_tensor * x) override {
835832 struct ggml_tensor * w = params[" weight" ];
836833 if (transpose_weight) {
837- w = ggml_cont (ctx, ggml_transpose (ctx, w));
834+ w = ggml_cont (ctx-> ggml_ctx , ggml_transpose (ctx-> ggml_ctx , w));
838835 }
839- return ggml_ext_linear (ctx, x, w, nullptr );
836+ return ggml_ext_linear (ctx-> ggml_ctx , x, w, nullptr );
840837 }
841838};
842839
@@ -860,8 +857,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
860857 blocks[" visual_projection" ] = std::shared_ptr<GGMLBlock>(new CLIPProjection (hidden_size, projection_dim, transpose_proj_w));
861858 }
862859
863- struct ggml_tensor * forward (struct ggml_context * ctx,
864- ggml_backend_t backend,
860+ struct ggml_tensor * forward (GGMLRunnerContext* ctx,
865861 struct ggml_tensor * pixel_values,
866862 bool return_pooled = true ,
867863 int clip_skip = -1 ) {
@@ -870,7 +866,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
870866 auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks[" vision_model" ]);
871867 auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks[" visual_projection" ]);
872868
873- auto x = vision_model->forward (ctx, backend, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
869+ auto x = vision_model->forward (ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
874870
875871 if (return_pooled) {
876872 x = visual_projection->forward (ctx, x); // [N, projection_dim]
@@ -902,8 +898,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
902898 model.get_param_tensors (tensors, prefix);
903899 }
904900
905- struct ggml_tensor * forward (struct ggml_context * ctx,
906- ggml_backend_t backend,
901+ struct ggml_tensor * forward (GGMLRunnerContext* ctx,
907902 struct ggml_tensor * input_ids,
908903 struct ggml_tensor * embeddings,
909904 size_t max_token_idx = 0 ,
@@ -913,10 +908,10 @@ struct CLIPTextModelRunner : public GGMLRunner {
913908 size_t n_token = input_ids->ne [0 ];
914909 if (input_ids->ne [0 ] > model.n_token ) {
915910 GGML_ASSERT (input_ids->ne [0 ] % model.n_token == 0 );
916- input_ids = ggml_reshape_2d (ctx, input_ids, model.n_token , input_ids->ne [0 ] / model.n_token );
911+ input_ids = ggml_reshape_2d (ctx-> ggml_ctx , input_ids, model.n_token , input_ids->ne [0 ] / model.n_token );
917912 }
918913
919- return model.forward (ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
914+ return model.forward (ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
920915 }
921916
922917 struct ggml_cgraph * build_graph (struct ggml_tensor * input_ids,
@@ -943,7 +938,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
943938 embeddings = ggml_concat (compute_ctx, token_embed_weight, custom_embeddings, 1 );
944939 }
945940
946- struct ggml_tensor * hidden_states = forward (compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
941+ auto runner_ctx = get_context ();
942+
943+ struct ggml_tensor * hidden_states = forward (&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
947944
948945 ggml_build_forward_expand (gf, hidden_states);
949946
0 commit comments