@@ -476,11 +476,12 @@ struct CLIPLayer : public GGMLBlock {
476476public:
477477 CLIPLayer (int64_t d_model,
478478 int64_t n_head,
479- int64_t intermediate_size)
479+ int64_t intermediate_size,
480+ bool proj_in = false )
480481 : d_model(d_model),
481482 n_head (n_head),
482483 intermediate_size(intermediate_size) {
483- blocks[" self_attn" ] = std::shared_ptr<GGMLBlock>(new MultiheadAttention (d_model, n_head, true , true ));
484+ blocks[" self_attn" ] = std::shared_ptr<GGMLBlock>(new MultiheadAttention (d_model, n_head, true , true , proj_in ));
484485
485486 blocks[" layer_norm1" ] = std::shared_ptr<GGMLBlock>(new LayerNorm (d_model));
486487 blocks[" layer_norm2" ] = std::shared_ptr<GGMLBlock>(new LayerNorm (d_model));
@@ -509,11 +510,12 @@ struct CLIPEncoder : public GGMLBlock {
509510 CLIPEncoder (int64_t n_layer,
510511 int64_t d_model,
511512 int64_t n_head,
512- int64_t intermediate_size)
513+ int64_t intermediate_size,
514+ bool proj_in = false )
513515 : n_layer(n_layer) {
514516 for (int i = 0 ; i < n_layer; i++) {
515517 std::string name = " layers." + std::to_string (i);
516- blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer (d_model, n_head, intermediate_size));
518+ blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer (d_model, n_head, intermediate_size, proj_in ));
517519 }
518520 }
519521
@@ -549,10 +551,10 @@ class CLIPEmbeddings : public GGMLBlock {
549551 int64_t num_positions;
550552 bool force_clip_f32;
551553
552- void init_params (struct ggml_context * ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = " " ) override {
554+ void init_params (struct ggml_context * ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = " " ) override {
553555 enum ggml_type token_wtype = GGML_TYPE_F32;
554556 if (!force_clip_f32) {
555- token_wtype = get_type (prefix + " token_embedding.weight" , tensor_types , GGML_TYPE_F32);
557+ token_wtype = get_type (prefix + " token_embedding.weight" , tensor_storage_map , GGML_TYPE_F32);
556558 if (!support_get_rows (token_wtype)) {
557559 token_wtype = GGML_TYPE_F32;
558560 }
@@ -605,7 +607,8 @@ class CLIPVisionEmbeddings : public GGMLBlock {
605607 int64_t image_size;
606608 int64_t num_patches;
607609 int64_t num_positions;
608- void init_params (struct ggml_context * ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = " " ) override {
610+
611+ void init_params (struct ggml_context * ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = " " ) override {
609612 enum ggml_type patch_wtype = GGML_TYPE_F16;
610613 enum ggml_type class_wtype = GGML_TYPE_F32;
611614 enum ggml_type position_wtype = GGML_TYPE_F32;
@@ -668,7 +671,7 @@ enum CLIPVersion {
668671
669672class CLIPTextModel : public GGMLBlock {
670673protected:
671- void init_params (struct ggml_context * ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = " " ) override {
674+ void init_params (struct ggml_context * ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = " " ) override {
672675 if (version == OPEN_CLIP_VIT_BIGG_14) {
673676 enum ggml_type wtype = GGML_TYPE_F32;
674677 params[" text_projection" ] = ggml_new_tensor_2d (ctx, wtype, projection_dim, hidden_size);
@@ -689,7 +692,8 @@ class CLIPTextModel : public GGMLBlock {
689692
690693 CLIPTextModel (CLIPVersion version = OPENAI_CLIP_VIT_L_14,
691694 bool with_final_ln = true ,
692- bool force_clip_f32 = false )
695+ bool force_clip_f32 = false ,
696+ bool proj_in = false )
693697 : version(version), with_final_ln(with_final_ln) {
694698 if (version == OPEN_CLIP_VIT_H_14) {
695699 hidden_size = 1024 ;
@@ -704,7 +708,7 @@ class CLIPTextModel : public GGMLBlock {
704708 }
705709
706710 blocks[" embeddings" ] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings (hidden_size, vocab_size, n_token, force_clip_f32));
707- blocks[" encoder" ] = std::shared_ptr<GGMLBlock>(new CLIPEncoder (n_layer, hidden_size, n_head, intermediate_size));
711+ blocks[" encoder" ] = std::shared_ptr<GGMLBlock>(new CLIPEncoder (n_layer, hidden_size, n_head, intermediate_size, proj_in ));
708712 blocks[" final_layer_norm" ] = std::shared_ptr<GGMLBlock>(new LayerNorm (hidden_size));
709713 }
710714
@@ -758,7 +762,7 @@ class CLIPVisionModel : public GGMLBlock {
758762 int32_t n_layer = 24 ;
759763
760764public:
761- CLIPVisionModel (CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
765+ CLIPVisionModel (CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false ) {
762766 if (version == OPEN_CLIP_VIT_H_14) {
763767 hidden_size = 1280 ;
764768 intermediate_size = 5120 ;
@@ -773,7 +777,7 @@ class CLIPVisionModel : public GGMLBlock {
773777
774778 blocks[" embeddings" ] = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings (hidden_size, num_channels, patch_size, image_size));
775779 blocks[" pre_layernorm" ] = std::shared_ptr<GGMLBlock>(new LayerNorm (hidden_size));
776- blocks[" encoder" ] = std::shared_ptr<GGMLBlock>(new CLIPEncoder (n_layer, hidden_size, n_head, intermediate_size));
780+ blocks[" encoder" ] = std::shared_ptr<GGMLBlock>(new CLIPEncoder (n_layer, hidden_size, n_head, intermediate_size, proj_in ));
777781 blocks[" post_layernorm" ] = std::shared_ptr<GGMLBlock>(new LayerNorm (hidden_size));
778782 }
779783
@@ -811,8 +815,8 @@ class CLIPProjection : public UnaryBlock {
811815 int64_t out_features;
812816 bool transpose_weight;
813817
814- void init_params (struct ggml_context * ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = " " ) override {
815- enum ggml_type wtype = get_type (prefix + " weight" , tensor_types , GGML_TYPE_F32);
818+ void init_params (struct ggml_context * ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = " " ) override {
819+ enum ggml_type wtype = get_type (prefix + " weight" , tensor_storage_map , GGML_TYPE_F32);
816820 if (transpose_weight) {
817821 params[" weight" ] = ggml_new_tensor_2d (ctx, wtype, out_features, in_features);
818822 } else {
@@ -845,15 +849,16 @@ class CLIPVisionModelProjection : public GGMLBlock {
845849
846850public:
847851 CLIPVisionModelProjection (CLIPVersion version = OPENAI_CLIP_VIT_L_14,
848- bool transpose_proj_w = false ) {
852+ bool transpose_proj_w = false ,
853+ bool proj_in = false ) {
849854 if (version == OPEN_CLIP_VIT_H_14) {
850855 hidden_size = 1280 ;
851856 projection_dim = 1024 ;
852857 } else if (version == OPEN_CLIP_VIT_BIGG_14) {
853858 hidden_size = 1664 ;
854859 }
855860
856- blocks[" vision_model" ] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel (version));
861+ blocks[" vision_model" ] = std::shared_ptr<GGMLBlock>(new CLIPVisionModel (version, proj_in ));
857862 blocks[" visual_projection" ] = std::shared_ptr<GGMLBlock>(new CLIPProjection (hidden_size, projection_dim, transpose_proj_w));
858863 }
859864
@@ -881,13 +886,24 @@ struct CLIPTextModelRunner : public GGMLRunner {
881886
882887 CLIPTextModelRunner (ggml_backend_t backend,
883888 bool offload_params_to_cpu,
884- const String2GGMLType& tensor_types ,
889+ const String2TensorStorage& tensor_storage_map ,
885890 const std::string prefix,
886891 CLIPVersion version = OPENAI_CLIP_VIT_L_14,
887892 bool with_final_ln = true ,
888893 bool force_clip_f32 = false )
889- : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
890- model.init (params_ctx, tensor_types, prefix);
894+ : GGMLRunner(backend, offload_params_to_cpu) {
895+ bool proj_in = false ;
896+ for (const auto & [name, tensor_storage] : tensor_storage_map) {
897+ if (!starts_with (name, prefix)) {
898+ continue ;
899+ }
900+ if (contains (name, " self_attn.in_proj" )) {
901+ proj_in = true ;
902+ break ;
903+ }
904+ }
905+ model = CLIPTextModel (version, with_final_ln, force_clip_f32, proj_in);
906+ model.init (params_ctx, tensor_storage_map, prefix);
891907 }
892908
893909 std::string get_desc () override {
0 commit comments