
Commit d06546b

committed
Start to tweak GPU layers autoloader
Add -dedicated parameter for LLM dedicated GPU
1 parent e16a1af commit d06546b
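
In practice the new switch is exposed both as a GUI checkbox and as a command-line flag. A hedged usage sketch (the model path and the other flags are illustrative; only --dedicated comes from this commit):

    python koboldcpp.py --model ./some-model.gguf --usecublas normal mmq --dedicated --contextsize 8192

With --dedicated set, the GPU layers autoloader reserves 0.5GB of VRAM instead of 1.125GB before estimating how many layers to offload.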

File tree

1 file changed: +165 −17 lines changed
Diff for: koboldcpp.py

+165 −17
@@ -599,35 +599,176 @@ def read_gguf_key(keyname,data,maxval):
     except Exception as ex:
         return None
 
-def autoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how many layers to use
+def autoset_gpu_layers(filepath,ctxsize,gpumem,quantkv,blasbatchsize,flashattention,mmqmode,lowvram,dedicated): #shitty algo to determine how many layers to use
     try:
         layerlimit = 0
         fsize = os.path.getsize(filepath)
         if fsize>10000000: #dont bother with models < 10mb
             cs = ctxsize
-            mem = gpumem
+
+            ded = dedicated
+
+            if ded == 1:
+                reserved_mem = 2**29
+            else:
+                reserved_mem = (2**30+2**27)
+            gpu_smem = gpumem / 4
+            mem = gpu_smem - reserved_mem
+
+            bbs = blasbatchsize
+            bbs_ratio = bbs / 128
+
+            fa = flashattention
+            fa_ratio = 1
+            if fa == 1:
+                fa_ratio = 0.5
+
+            mmq = mmqmode
+            mmq_ratio = 1
+            if mmq == 1:
+                mmq_ratio = 0.5
+
+            lv = lowvram
+            lvctx_ratio = 1
+            if lv == 1:
+                lvctx_ratio = 0
+            lvcomp_ratio = 1
+            if lv == 1:
+                lvcomp_ratio = 0.5
+
+            kvq = quantkv
+            kvbpw = 0
+            if kvq == 0:
+                kvbpw = 32
+            if kvq == 1:
+                kvbpw = 24.5
+            if kvq == 2:
+                kvbpw = 22
+            if kvq == 3:
+                kvbpw = 21.5
+            if kvq == 4:
+                kvbpw = 21
+            if kvq == 5:
+                kvbpw = 20.5
+            if kvq == 6:
+                kvbpw = 17
+            if kvq == 7:
+                kvbpw = 14.5
+            if kvq == 8:
+                kvbpw = 14
+            if kvq == 9:
+                kvbpw = 13.5
+            if kvq == 10:
+                kvbpw = 13
+            if kvq == 11:
+                kvbpw = 12
+            if kvq == 12:
+                kvbpw = 11.5
+            if kvq == 13:
+                kvbpw = 11
+            if kvq == 14:
+                kvbpw = 10.5
+            if kvq == 15:
+                kvbpw = 11
+            if kvq == 16:
+                kvbpw = 10.5
+            if kvq == 17:
+                kvbpw = 10
+            if kvq == 18:
+                kvbpw = 10
+            if kvq == 19:
+                kvbpw = 9.5
+            if kvq == 20:
+                kvbpw = 9
+            if kvq == 21:
+                kvbpw = 32
+            if kvq == 22:
+                kvbpw = 24.5
+            if kvq == 23:
+                kvbpw = 22
+            if kvq == 24:
+                kvbpw = 21.5
+            if kvq == 25:
+                kvbpw = 21
+            if kvq == 26:
+                kvbpw = 20.5
+
             csmul = 1.0
-            if cs and cs > 8192:
-                csmul = 1.4
+            if cs and cs > 131072:
+                csmul = 1.35
+            elif cs and cs > 65536:
+                csmul = 1.30
+            elif cs and cs > 32768:
+                csmul = 1.25
+            elif cs and cs > 16384:
+                csmul = 1.20
+            elif cs and cs > 8192:
+                csmul = 1.15
             elif cs and cs > 4096:
-                csmul = 1.2
+                csmul = 1.10
             elif cs and cs > 2048:
-                csmul = 1.1
-            if mem < fsize*1.6*csmul:
+                csmul = 1.05
+
+            layer_offset = 0
+
+            print("***")
+            print(f"Model size: {fsize} B ; GPU VRAM: {gpumem} B ; GPU simulated VRAM: {gpu_smem} B")
+            print(f"Reserved VRAM {reserved_mem} B ; GPU usable VRAM {mem} B")
+            print(f"Blas batch size: {bbs} ; BBS ratio: {bbs_ratio}")
+            print(f"Flash Attention: {fa} ; FA ratio: {fa_ratio}")
+            print(f"MMQ: {mmq} ; MMQ ratio: {mmq_ratio}")
+            print(f"Lowvram: {lv} ; Lowvram context ratio: {lvctx_ratio} ; Lowvram compute ratio: {lvcomp_ratio}")
+            print(f"Quant KV mode: {kvq} ; Quant KV bpw: {kvbpw}")
+            print(f"Context size: {cs} ; Context compute buffer multiplier (CCBM): {csmul}")
+            print(f"Manual layer offset: {layer_offset}")
+            print("***")
+            if mem < fsize*1.1*csmul:
+                print(f"GPU usable VRAM: {mem} B < {fsize} B * 1.1 * {csmul} (CCBM)")
                 ggufmeta = read_gguf_metadata(filepath)
                 if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers
+                    print(f"Failed to read metadata or no layer count declared. Falling back to a size-based estimate.")
                     sizeperlayer = fsize*csmul*0.052
+                    print(f"Size per layer = Model size {fsize} B x 0.052 x {csmul} (CCBM)")
                     layerlimit = int(min(200,mem/sizeperlayer))
+                    print(f"Size per layer: {sizeperlayer} B ; layers limit: {layerlimit}")
                 else:
+                    print(f"Metadata read successfully, proceeding...")
                     layers = ggufmeta[0]
                     headcount = ggufmeta[1]
                     headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
-                    ratio = mem/(fsize*csmul*1.5)
+                    sizeperlayer = fsize/layers
+                    print(f"Model layers: {layers} ; Attention heads: {headcount} ; Head size: {headkvlen} ; Size per layer: {sizeperlayer} B")
+                    ratio_init = mem/(fsize*csmul*1.5)
+                    print(f"Initial ratio: {ratio_init} = GPU usable VRAM {mem} B / (Model size {fsize} B x 1.5 x {csmul} (CCBM))")
                     if headcount > 0:
-                        ratio = max(ratio,mem/(fsize*1.34 + (layers*headcount*headkvlen*cs*4.25)))
-                    layerlimit = int(ratio*layers)
+                        print("***")
+                        print(f"Attention heads: {headcount} > 0")
+                        # ratio = max(ratio_init,mem/(fsize*1.34 + (layers*headcount*headkvlen*cs*4.25))) #concedo
+                        # ratio = max(ratio_init,mem/(fsize*1.025 + (layers*headcount*headkvlen*cs*4) + (layers*4*headkvlen*cs*4) + (1.5*1024*1024*1024))) #Henky
+                        # ratio = min(ratio_init,mem/(fsize*1.04 + (layers*(headcount+(bbs_ratio*mmq_ratio*fa_ratio))*headkvlen*cs*kvbpw/8))) #Nexes based on Pyroserenus
+                        loaded_layers = (layers*ratio_init+layer_offset)
+                        loaded_layers_size = loaded_layers * sizeperlayer
+                        print(f"Initially loaded layers: {loaded_layers} ; Size per layer: {sizeperlayer} B ; Loaded layer size {loaded_layers_size} B")
+                        print(f"context size: {cs} tokens ; GPU usable VRAM: {mem} B ; quant_kv_bpw: {kvbpw} bpw")
+                        context_buffer = (layers*headcount*headkvlen*cs*lvctx_ratio*kvbpw/8)
+                        compute_buffer = (layers*bbs_ratio*mmq_ratio*fa_ratio*headkvlen*cs*lvcomp_ratio*4)
+                        total_buffer = context_buffer + compute_buffer
+                        loaded_size = int(fsize*1.1 + context_buffer)
+                        ratio_formula = (mem - compute_buffer)/loaded_size
+                        print(f"Context buffer: {context_buffer} B + Compute buffer: {compute_buffer} B = Total buffer: {total_buffer} B")
+                        print(f"Loaded size: {loaded_size} B ; Formula ratio: {ratio_formula}")
+                        ratio = max(ratio_init,ratio_formula)
+                    else: ratio = ratio_init
+                    layerlimit = int(ratio*layers+layer_offset)
+                    print(f"Layers limit: {layerlimit} = final ratio {ratio} x {layers} layers")
+                    estimated_loaded_size = int(layerlimit*sizeperlayer + total_buffer)
+                    estimated_occupation_size = int(estimated_loaded_size + reserved_mem)
+                    print(f"Estimated loaded size: {estimated_loaded_size} B ; Estimated occupation size: {estimated_occupation_size} B")
             else:
+                print(f"Best case: assume full offload.")
                 layerlimit = 200 # assume full offload
+                print(f"GPU usable VRAM {mem} B exceeds Model size {fsize} B x 1.1 x {csmul} (CCBM)")
+            print("***")
         return layerlimit
     except Exception as ex:
         return 0
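
To make the new estimate easier to follow, here is a minimal standalone sketch of the same arithmetic outside the diff, using hypothetical inputs (a ~5 GB model with 48 layers, 32 attention heads, head size 128, an 8192-token context, full-precision KV, BLAS batch 512, flash attention, MMQ and lowvram off, and a 24 GiB card). It only mirrors the formulas added above and is not part of the commit:

    # Hedged worked example of the autoloader math above; every input value is hypothetical.
    fsize = 5_000_000_000                 # model file size in bytes
    layers, headcount, headkvlen = 48, 32, 128
    cs = 8192                             # context size in tokens
    csmul = 1.10                          # CCBM for 4096 < cs <= 8192 per the table above
    kvbpw = 32                            # quantkv == 0 in the kvq table above
    bbs_ratio = 512 / 128                 # blasbatchsize 512
    fa_ratio = mmq_ratio = 1              # flash attention and MMQ disabled
    lvctx_ratio = lvcomp_ratio = 1        # lowvram disabled
    reserved_mem = 2**30 + 2**27          # default reservation (no --dedicated)
    gpumem = 24 * 1024**3                 # 24 GiB card
    mem = gpumem / 4 - reserved_mem       # "simulated" VRAM minus the reserved area, as above

    sizeperlayer = fsize / layers
    ratio_init = mem / (fsize * csmul * 1.5)
    context_buffer = layers * headcount * headkvlen * cs * lvctx_ratio * kvbpw / 8
    compute_buffer = layers * bbs_ratio * mmq_ratio * fa_ratio * headkvlen * cs * lvcomp_ratio * 4
    loaded_size = int(fsize * 1.1 + context_buffer)
    ratio_formula = (mem - compute_buffer) / loaded_size
    layerlimit = int(max(ratio_init, ratio_formula) * layers)
    print(layerlimit)                     # 30 with these made-up numbers

The only knob the new --dedicated flag touches in this path is reserved_mem, which drops from 2**30+2**27 (1.125GB) to 2**29 (0.5GB), leaving slightly more usable VRAM for layers.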
@@ -2193,6 +2334,7 @@ def hide_tooltip(event):
     gpu_choice_var = ctk.StringVar(value="1")
 
     launchbrowser = ctk.IntVar(value=1)
+    dedicated = ctk.IntVar(value=0)
     highpriority = ctk.IntVar()
     disablemmap = ctk.IntVar()
     usemlock = ctk.IntVar()
@@ -2489,6 +2631,7 @@ def changerunmode(a,b,c):
         tensor_split_label.grid_forget()
         tensor_split_entry.grid_forget()
         splitmode_box.grid_forget()
+        dedicated_box.grid_forget()
 
         if index == "Use Vulkan":
             tensor_split_label.grid(row=8, column=0, padx = 8, pady=1, stick="nw")
@@ -2534,11 +2677,13 @@ def changerunmode(a,b,c):
     # quick boxes
     quick_boxes = {
         "Launch Browser": [launchbrowser, "Launches your default browser after model loading is complete"],
-        "Disable MMAP": [disablemmap, "Avoids using mmap to load models if enabled"],
+        "Disable MMAP": [disablemmap, "Avoids using mmap to load models if enabled"],
         "Use ContextShift": [contextshift, "Uses Context Shifting to reduce reprocessing.\nRecommended. Check the wiki for more info."],
-        "Remote Tunnel": [remotetunnel, "Creates a trycloudflare tunnel.\nAllows you to access koboldcpp from other devices over an internet URL."],
+        "Use SmartContext": [smartcontext, "Use Smart Context. Now considered outdated and not recommended, except for KVQ with FA.\nCheck the wiki for more info."],
+        "Remote Tunnel": [remotetunnel, "Creates a trycloudflare tunnel.\nAllows you to access koboldcpp from other devices over an internet URL."],
         "Use FlashAttention": [flashattention, "Enable flash attention for GGUF models."],
-        "Quiet Mode": [quietmode, "Prevents all generation related terminal output from being displayed."]
+        "Quiet Mode": [quietmode, "Prevents all generation related terminal output from being displayed."],
+        "High Priority - disabled by default": [highpriority, "Increases the koboldcpp process priority.\nMay cause lag or slowdown instead. Not recommended."]
     }
 
     for idx, (name, properties) in enumerate(quick_boxes.items()):
@@ -2585,6 +2730,7 @@ def changerunmode(a,b,c):
     lowvram_box = makecheckbox(hardware_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0, tooltiptxt='Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a speed loss.')
     mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.")
     splitmode_box = makecheckbox(hardware_tab, "Row-Split", rowsplit_var, 5,0, tooltiptxt="Split rows across GPUs instead of splitting layers and KV across GPUs.\nUses the main GPU for small tensors and intermediate results. Speed may vary.")
+    dedicated_box = makecheckbox(hardware_tab, "Dedicated GPU for LLM", dedicated, 5,1, tooltiptxt="Reduces the reserved area of the GPU layers autoloader from 1.125GB to 0.5GB.")
 
     # threads
     makelabelentry(hardware_tab, "Threads:" , threads_var, 11, 50,tooltip="How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.")
@@ -2595,6 +2741,7 @@ def changerunmode(a,b,c):
         "High Priority": [highpriority, "Increases the koboldcpp process priority.\nMay cause lag or slowdown instead. Not recommended."],
         "Disable MMAP": [disablemmap, "Avoids using mmap to load models if enabled"],
         "Use mlock": [usemlock, "Enables mlock, preventing the RAM used to load the model from being paged out."],
+        # "Direct I/O": [usedirect_io, "Enables Direct_IO, accelerating the model loading time"],
         "Debug Mode": [debugmode, "Enables debug mode, with extra info printed to the terminal."],
         "Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."]
     }
@@ -2789,6 +2936,7 @@ def export_vars():
     # args.token_healing = token_healing.get()
     args.debugmode = debugmode.get()
     args.launch = launchbrowser.get()==1
+    args.dedicated = dedicated.get()==1
     args.highpriority = highpriority.get()==1
     args.nommap = disablemmap.get()==1
     args.smartcontext = smartcontext.get()==1
@@ -2927,6 +3075,7 @@ def import_vars(dict):
     if "debugmode" in dict:
         debugmode.set(dict["debugmode"])
     launchbrowser.set(1 if "launch" in dict and dict["launch"] else 0)
+    dedicated.set(1 if "dedicated" in dict and dict["dedicated"] else 0)
     highpriority.set(1 if "highpriority" in dict and dict["highpriority"] else 0)
     disablemmap.set(1 if "nommap" in dict and dict["nommap"] else 0)
     smartcontext.set(1 if "smartcontext" in dict and dict["smartcontext"] else 0)
@@ -3816,7 +3965,7 @@ def main(launch_args,start_server=True):
             fetch_gpu_properties(False,True,True)
             pass
         if MaxMemory[0] > 0:
-            layeramt = autoset_gpu_layers(args.model_param, args.contextsize, MaxMemory[0])
+            layeramt = autoset_gpu_layers(args.model_param, args.contextsize, MaxMemory[0], args.quantkv, args.blasbatchsize, args.flashattention, "mmq" in args.usecublas, "lowvram" in args.usecublas, args.dedicated)
            print(f"Auto Recommended Layers: {layeramt}")
            args.gpulayers = layeramt
 
@@ -4147,11 +4296,10 @@ def range_checker(arg: str):
         return f
     return range_checker
 
-print(f"***\nWelcome to KoboldCpp Frankenstein Fork - Version {KcppVersion}") # just update version manually
+print(f"***\nWelcome to Kobold.Cpp Frankenstein Fork - Version {KcppVersion}") # just update version manually
 print(f"***\nBased on LlamaCpp - Version {LcppVersion}") # just update LlamaCPP version manually
 print(f"***\nRelease date: {ReleaseDate}") # just update date manually
 print(f"***\nCuda mode compiled, if any: {CudaSpecifics}") # just update Cuda options used in CMake manually
-
 print("***")
 # print("Python version: " + sys.version)
 parser = argparse.ArgumentParser(description='KoboldCpp Server')
@@ -4164,7 +4312,7 @@ def range_checker(arg: str):
 parser.add_argument("--host", metavar=('[ipaddr]'), help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="")
 parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true')
 parser.add_argument("--config", metavar=('[filename]'), help="Load settings from a .kcpps file. Other arguments will be ignored", type=str, nargs=1)
-
+parser.add_argument("--dedicated", help="Reduces the reserved area of the GPU layers autoloader from 1.125GB to 0.5GB.", action='store_true')
 parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
 compatgroup = parser.add_mutually_exclusive_group()
 compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration (NVIDIA Geforce RTX cards, GTX 1xxx and 9xx are also compatible to some extend). Requires CUDA. Select lowvram to not allocate the context KV cache in VRAM, but instead in RAM. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit'])

0 commit comments