#include "imatrix.hpp"

/* Stolen from llama.cpp (credits: Kawrakow) */

#include "ggml-backend.h"
#include "ggml.h"
#include "util.h"

#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

// remove any prefixes and suffixes from the name
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
static std::string filter_tensor_name(const char * name) {
    std::string wname;
    const char * p = strchr(name, '#');
    if (p != NULL) {
        p = p + 1;
        const char * q = strchr(p, '#');
        if (q != NULL) {
            wname = std::string(p, q - p);
        } else {
            wname = p;
        }
    } else {
        wname = name;
    }
    return wname;
}

bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);
    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
    std::string wname = filter_tensor_name(src0->name);

    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID)
            return true;  // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT)
            return false;
        // why are small batches ignored (<16 tokens)?
        // if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        if (!(wname.substr(0, 6) == "model." || wname.substr(0, 17) == "cond_stage_model." || wname.substr(0, 14) == "text_encoders."))
            return false;
        return true;
    }
    // LOG_DEBUG("%s", wname.c_str());

    std::lock_guard<std::mutex> lock(m_mutex);

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);

    if (!is_host) {
        m_src1_data.resize(ggml_nelements(src1));
        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
    }

    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
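
    // From here on, e.values accumulates the sum of squared activations x[j]^2 seen by
    // each input column j of the weight matrix, and e.counts the number of samples that
    // contributed; values/counts is the per-column mean square used as the importance weight.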

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        // ids -> [n_experts_used, n_tokens]
        // src1 -> [cols, n_expert_used, n_tokens]
        const ggml_tensor * ids = t->src[2];
        const int n_as = src0->ne[2];
        const int n_ids = ids->ne[0];

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
        // take into account that ids is not contiguous!

        GGML_ASSERT(ids->ne[1] == src1->ne[2]);

        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
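        // m_ids now holds the raw bytes of the ids tensor; the nb[] strides used below
        // are byte offsets, which is how the non-contiguous layout is handled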

        auto & e = m_stats[wname];

        ++e.ncall;

        if (e.values.empty()) {
            e.values.resize(src1->ne[0] * n_as, 0);
            e.counts.resize(src1->ne[0] * n_as, 0);
        } else if (e.values.size() != (size_t) src1->ne[0] * n_as) {
            LOG_ERROR("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int) e.values.size(), (int) src1->ne[0] * n_as);
            exit(1);  // GGML_ABORT("fatal error");
        }
        // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
        // loop over all possible experts, regardless of whether they are used in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex * src1->ne[0];

            for (int idx = 0; idx < n_ids; ++idx) {
                for (int row = 0; row < (int) src1->ne[2]; ++row) {
                    const int excur = *(const int32_t *) (m_ids.data() + row * ids->nb[1] + idx * ids->nb[0]);

                    GGML_ASSERT(excur >= 0 && excur < n_as);  // sanity check

                    if (excur != ex)
                        continue;

                    const int64_t i11 = idx % src1->ne[1];
                    const int64_t i12 = row;
                    const float * x = (const float *) ((const char *) data + i11 * src1->nb[1] + i12 * src1->nb[2]);

                    for (int j = 0; j < (int) src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j] * x[j];
                        e.counts[e_start + j]++;
                        if (!std::isfinite(e.values[e_start + j])) {
                            printf("\n");
                            LOG_ERROR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
            }
        }
    } else {
        auto & e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
            e.counts.resize(src1->ne[0], 0);
        } else if (e.values.size() != (size_t) src1->ne[0]) {
            LOG_WARN("inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int) e.values.size(), (int) src1->ne[0]);
            exit(1);  // GGML_ABORT("fatal error");
        }

        ++e.ncall;
        // LOG_DEBUG("%s[%d]: %32s, %s, %5d x %5d, %d\n", m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
        for (int row = 0; row < (int) src1->ne[1]; ++row) {
            const float * x = data + row * src1->ne[0];
            for (int j = 0; j < (int) src1->ne[0]; ++j) {
                e.values[j] += x[j] * x[j];
                e.counts[j]++;
                if (!std::isfinite(e.values[j])) {
                    LOG_WARN("%f detected in %s\n", e.values[j], wname.c_str());
                    exit(1);
                }
            }
        }
    }
    return true;
}
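
// How this callback is typically wired up (a minimal sketch; `sched` and `g_collector`
// are illustrative names, not definitions from this file):
//
//   static bool imatrix_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
//       return ((IMatrixCollector *) user_data)->collect_imatrix(t, ask, user_data);
//   }
//
//   ggml_backend_sched_set_eval_callback(sched, imatrix_eval_cb, &g_collector);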

void IMatrixCollector::save_imatrix(std::string fname, int ncall) const {
    LOG_INFO("SAVING_IMATRIX to %s\n", fname.c_str());

    if (ncall > 0) {
        fname += ".at_";
        fname += std::to_string(ncall);
    }
    // avoid writing imatrix entries that do not have full data
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data

    int n_entries = 0;
    std::vector<std::string> to_store;

    bool is_first = true;  // for printing
    for (const auto & kv : m_stats) {
        const int n_all = kv.second.counts.size();

        if (n_all == 0) {
            continue;
        }

        int n_zeros = 0;
        for (const int c : kv.second.counts) {
            if (c == 0) {
                n_zeros++;
            }
        }

        if (n_zeros != 0 && is_first) {
            printf("\n");
            is_first = false;
        }

        if (n_zeros == n_all) {
            LOG_WARN("entry '%40s' has no data - skipping\n", kv.first.c_str());
            continue;
        }

        if (n_zeros > 0) {
            LOG_WARN("entry '%40s' has partial data (%.2f%%) - skipping\n", kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
            continue;
        }

        n_entries++;
        to_store.push_back(kv.first);
    }

    if (to_store.size() < m_stats.size()) {
        LOG_WARN("storing only %zu out of %zu entries\n", to_store.size(), m_stats.size());
    }

    std::ofstream out(fname, std::ios::binary);
    out.write((const char *) &n_entries, sizeof(n_entries));
    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
        int len = name.size();
        out.write((const char *) &len, sizeof(len));
        out.write(name.c_str(), len);
        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
        int nval = stat.values.size();
        out.write((const char *) &nval, sizeof(nval));
        if (nval > 0) {
            // store the per-element mean square scaled by ncall, so that load_imatrix()
            // can merge several imatrix files as an average weighted by the call counts
            std::vector<float> tmp(nval);
            for (int i = 0; i < nval; i++) {
                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
            }
            out.write((const char *) tmp.data(), nval * sizeof(float));
        }
    }

    // write the number of calls the matrix was computed with
    out.write((const char *) &m_last_call, sizeof(m_last_call));

    // LOG_DEBUG("\n");
    // LOG_DEBUG("stored collected data after %d chunks in %s\n", m_last_call, fname.c_str());
}
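
// On-disk layout produced above, matching what load_imatrix() reads back
// (counts are 32-bit ints, values are 32-bit floats, host endianness):
//   n_entries
//   per entry: name length, name bytes, ncall, nval, nval floats
//   m_last_call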

bool IMatrixCollector::load_imatrix(const char * fname) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) {
        LOG_ERROR("failed to open %s\n", fname);
        return false;
    }
    int n_entries;
    in.read((char *) &n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        LOG_ERROR("no data in file %s\n", fname);
        return false;
    }
    for (int i = 0; i < n_entries; ++i) {
        int len;
        in.read((char *) &len, sizeof(len));
        std::vector<char> name_as_vec(len + 1);
        in.read((char *) name_as_vec.data(), len);
        if (in.fail()) {
            LOG_ERROR("failed reading name for entry %d from %s\n", i + 1, fname);
            return false;
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
        auto & e = m_stats[std::move(name)];
        int ncall;
        in.read((char *) &ncall, sizeof(ncall));
        int nval;
        in.read((char *) &nval, sizeof(nval));
        if (in.fail() || nval < 1) {
            LOG_ERROR("failed reading number of values for entry %d\n", i);
            m_stats = {};
            return false;
        }

        if (e.values.empty()) {
            e.values.resize(nval, 0);
            e.counts.resize(nval, 0);
        }

        std::vector<float> tmp(nval);
        in.read((char *) tmp.data(), nval * sizeof(float));
        if (in.fail()) {
            LOG_ERROR("failed reading data for entry %d\n", i);
            m_stats = {};
            return false;
        }

        // recreate the state as expected by save_imatrix(), and correct for the weighted sum
        for (int j = 0; j < nval; j++) {
            e.values[j] += tmp[j];
            e.counts[j] += ncall;
        }
        e.ncall += ncall;
    }
    return true;
}
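
// Note: loading is additive, so load_imatrix() can be called before collecting to merge
// a previously saved imatrix with fresh data; the ncall scaling applied by save_imatrix()
// keeps the merged result an average weighted by the call counts.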