@@ -266,8 +266,13 @@ struct LoraModel : public GGMLRunner {
             float scale_value = 1.0f;
             std::string fk = lora_pre[type] + key;
             if (lora_tensors.find(fk + ".hada_w1_a") != lora_tensors.end()) {
-                // loHa mode
+                // LoHa mode

+                // TODO: split qkv convention for LoHas (is it ever used?)
+                if (is_qkv_split || is_qkvm_split) {
+                    LOG_ERROR("Split qkv isn't supported for LoHa models.");
+                    break;
+                }
                 std::string alpha_name = "";

                 ggml_tensor* hada_1_mid = NULL;  // tau for tucker decomposition
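For context on the branch above: LoHa ("low-rank Hadamard") stores two low-rank pairs and reconstructs the weight delta as their element-wise product, scaled by alpha over the adapter rank, which is why the variable is renamed from dim to rank further down. A minimal sketch of that composition on plain row-major matrices, for illustration only; the patch builds this with ggml ops, and the names below simply mirror the hada_* tensors:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // C = A (out x r) * B (r x in), all row-major.
    static std::vector<float> matmul(const std::vector<float>& A, const std::vector<float>& B,
                                     int64_t out, int64_t r, int64_t in) {
        std::vector<float> C(out * in, 0.0f);
        for (int64_t i = 0; i < out; i++)
            for (int64_t k = 0; k < r; k++)
                for (int64_t j = 0; j < in; j++)
                    C[i * in + j] += A[i * r + k] * B[k * in + j];
        return C;
    }

    // delta_W = (alpha / rank) * (w1_a * w1_b) ⊙ (w2_a * w2_b)
    static std::vector<float> loha_delta(const std::vector<float>& w1_a, const std::vector<float>& w1_b,
                                         const std::vector<float>& w2_a, const std::vector<float>& w2_b,
                                         int64_t out, int64_t in, int64_t rank, float alpha) {
        float scale = alpha / rank;
        std::vector<float> m1 = matmul(w1_a, w1_b, out, rank, in);
        std::vector<float> m2 = matmul(w2_a, w2_b, out, rank, in);
        std::vector<float> delta(out * in);
        for (size_t i = 0; i < delta.size(); i++)
            delta[i] = scale * m1[i] * m2[i];  // element-wise (Hadamard) product
        return delta;
    }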
@@ -286,11 +291,6 @@ struct LoraModel : public GGMLRunner {
                 std::string hada_2_down_name = "";
                 std::string hada_2_up_name = "";

-                // TODO: split qkv convention for LoHas (is it ever used?)
-                if (is_qkv_split || is_qkvm_split) {
-                    LOG_ERROR("Split qkv isn't supported for LoHa models.");
-                    break;
-                }

                 hada_1_down_name = fk + ".hada_w1_b";
                 hada_1_up_name = fk + ".hada_w1_a";
@@ -340,12 +340,20 @@

                 // calc_scale
                 // TODO: .dora_scale?
-                int64_t dim = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
+                int64_t rank = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
                 if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
                     float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
-                    scale_value = alpha / dim;
+                    scale_value = alpha / rank;
                 }
             } else if (lora_tensors.find(fk + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(fk + ".lokr_w1_a") != lora_tensors.end()) {
+                // LoKr mode
+
+                // TODO: split qkv convention for LoKrs (is it ever used?)
+                if (is_qkv_split || is_qkvm_split) {
+                    LOG_ERROR("Split qkv isn't supported for LoKr models.");
+                    break;
+                }
+
                 std::string alpha_name = fk + ".alpha";

                 ggml_tensor* lokr_w1 = NULL;
@@ -354,12 +362,6 @@
                 std::string lokr_w1_name = "";
                 std::string lokr_w2_name = "";

-                // TODO: split qkv convention for LoKrs (is it ever used?)
-                if (is_qkv_split || is_qkvm_split) {
-                    LOG_ERROR("Split qkv isn't supported for LoKr models.");
-                    break;
-                }
-
                 lokr_w1_name = fk + ".lokr_w1";
                 lokr_w2_name = fk + ".lokr_w2";

@@ -372,14 +374,14 @@
                     std::string down_name = lokr_w1_name + "_b";
                     std::string up_name = lokr_w1_name + "_a";
                     if (lora_tensors.find(down_name) != lora_tensors.end()) {
+                        // w1 should not be low rank normally, sometimes w1 and w2 are swapped
                         down = to_f32(compute_ctx, lora_tensors[down_name]);
                         applied_lora_tensors.insert(down_name);

-                        // scale != 1 only when using Low rank form (?)
-                        int64_t dim = down->ne[ggml_n_dims(down) - 1];
+                        int64_t rank = down->ne[ggml_n_dims(down) - 1];
                         if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
                             float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
-                            scale_value = alpha / dim;
+                            scale_value = alpha / rank;
                         }
                     }
                     if (lora_tensors.find(up_name) != lora_tensors.end()) {
@@ -399,18 +401,25 @@
                     if (lora_tensors.find(down_name) != lora_tensors.end()) {
                         down = to_f32(compute_ctx, lora_tensors[down_name]);
                         applied_lora_tensors.insert(down_name);
+
+                        int64_t rank = down->ne[ggml_n_dims(down) - 1];
+                        if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
+                            float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
+                            scale_value = alpha / rank;
+                        }
                     }
                     if (lora_tensors.find(up_name) != lora_tensors.end()) {
                         up = to_f32(compute_ctx, lora_tensors[up_name]);
                         applied_lora_tensors.insert(up_name);
                     }
                     lokr_w2 = ggml_merge_lora(compute_ctx, down, up);
                 }
+
+                // Technically it might be unused, but I believe it's the expected behavior
+                applied_lora_tensors.insert(alpha_name);

                 updown = ggml_kronecker(compute_ctx, lokr_w1, lokr_w2);

-                // TODO: double check alpha implementation, it seems strange to not use them most of the time
-                applied_lora_tensors.insert(alpha_name);
             } else {
                 // LoRA mode
                 ggml_tensor* lora_mid = NULL;  // tau for tucker decomposition
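For context on the ggml_kronecker call above: LoKr stores the delta as a Kronecker product of two factors, where either factor may itself be shipped as a low-rank a/b pair (merged with ggml_merge_lora before the product), and the alpha/rank of that low-rank factor supplies scale_value. A stand-alone sketch of the Kronecker product on row-major matrices, for illustration only and independent of ggml:

    #include <cstdint>
    #include <vector>

    // Kronecker product of A (m x n) and B (p x q), row-major; result is (m*p) x (n*q).
    static std::vector<float> kronecker(const std::vector<float>& A, int64_t m, int64_t n,
                                        const std::vector<float>& B, int64_t p, int64_t q,
                                        float scale = 1.0f) {
        std::vector<float> K(m * p * n * q);
        for (int64_t i = 0; i < m; i++)
            for (int64_t j = 0; j < n; j++)
                for (int64_t r = 0; r < p; r++)
                    for (int64_t c = 0; c < q; c++)
                        // element (i*p + r, j*q + c) of the result
                        K[(i * p + r) * (n * q) + (j * q + c)] = scale * A[i * n + j] * B[r * q + c];
        return K;
    }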
@@ -770,12 +779,12 @@ struct LoraModel : public GGMLRunner {
                 }
                 // calc_scale
                 // TODO: .dora_scale?
-                int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1];
+                int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1];
                 if (lora_tensors.find(scale_name) != lora_tensors.end()) {
                     scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
                 } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
                     float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
-                    scale_value = alpha / dim;
+                    scale_value = alpha / rank;
                 }

                 updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
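The last hunk is only the dim-to-rank rename in the plain-LoRA branch, but it also shows the precedence used when resolving the scale: an explicit .scale tensor wins, otherwise alpha divided by the rank (the last ne dimension of the down-projection), otherwise the default of 1.0. A hypothetical stand-alone helper expressing that rule; the std::optional signature is my own illustration, since the patch reads these values straight from ggml tensors:

    #include <cstdint>
    #include <optional>

    // Hypothetical helper: mirrors the precedence in the patch (scale > alpha/rank > 1.0f).
    static float resolve_scale(std::optional<float> scale,  // from "<key>.scale", if present
                               std::optional<float> alpha,  // from "<key>.alpha", if present
                               int64_t rank) {              // last dimension of the down tensor
        if (scale.has_value())
            return *scale;
        if (alpha.has_value())
            return *alpha / static_cast<float>(rank);
        return 1.0f;
    }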