You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
defautoset_gpu_layers(filepath,ctxsize,gpumem): #shitty algo to determine how many layers to use
602
+
defautoset_gpu_layers(filepath,ctxsize,gpumem,quantkv,blasbatchsize,flashattention,mmqmode,lowvram,dedicated): #shitty algo to determine how many layers to use
603
603
try:
604
604
layerlimit=0
605
605
fsize=os.path.getsize(filepath)
606
606
iffsize>10000000: #dont bother with models < 10mb
607
607
cs=ctxsize
608
-
mem=gpumem
608
+
609
+
ded=dedicated
610
+
611
+
ifded==1:
612
+
reserved_mem=2**29
613
+
else:
614
+
reserved_mem= (2**30+2**27)
615
+
gpu_smem=gpumem/4
616
+
mem=gpu_smem-reserved_mem
617
+
618
+
bbs=blasbatchsize
619
+
bbs_ratio=bbs/128
620
+
621
+
fa=flashattention
622
+
fa_ratio=1
623
+
iffa==1:
624
+
fa_ratio=0.5
625
+
626
+
mmq=mmqmode
627
+
mmq_ratio=1
628
+
ifmmq==1:
629
+
mmq_ratio=0.5
630
+
631
+
lv=lowvram
632
+
lvctx_ratio=1
633
+
iflv==1:
634
+
lvctx_ratio=0
635
+
lvcomp_ratio=1
636
+
iflv==1:
637
+
lvcomp_ratio=0.5
638
+
639
+
kvq=quantkv
640
+
kvbpw=0
641
+
ifkvq==0:
642
+
kvbpw=32
643
+
ifkvq==1:
644
+
kvbpw=24.5
645
+
ifkvq==2:
646
+
kvbpw=22
647
+
ifkvq==3:
648
+
kvbpw=21.5
649
+
ifkvq==4:
650
+
kvbpw=21
651
+
ifkvq==5:
652
+
kvbpw=20.5
653
+
ifkvq==6:
654
+
kvbpw=17
655
+
ifkvq==7:
656
+
kvbpw=14.5
657
+
ifkvq==8:
658
+
kvbpw=14
659
+
ifkvq==9:
660
+
kvbpw=13.5
661
+
ifkvq==10:
662
+
kvbpw=13
663
+
ifkvq==11:
664
+
kvbpw=12
665
+
ifkvq==12:
666
+
kvbpw=11.5
667
+
ifkvq==13:
668
+
kvbpw=11
669
+
ifkvq==14:
670
+
kvbpw=10.5
671
+
ifkvq==15:
672
+
kvbpw=11
673
+
ifkvq==16:
674
+
kvbpw=10.5
675
+
ifkvq==17:
676
+
kvbpw=10
677
+
ifkvq==18:
678
+
kvbpw=10
679
+
ifkvq==19:
680
+
kvbpw=9.5
681
+
ifkvq==20:
682
+
kvbpw=9
683
+
ifkvq==21:
684
+
kvbpw=32
685
+
ifkvq==22:
686
+
kvbpw=24.5
687
+
ifkvq==23:
688
+
kvbpw=22
689
+
ifkvq==24:
690
+
kvbpw=21.5
691
+
ifkvq==25:
692
+
kvbpw=21
693
+
ifkvq==26:
694
+
kvbpw=20.5
695
+
609
696
csmul=1.0
610
-
ifcsandcs>8192:
611
-
csmul=1.4
697
+
ifcsandcs>131072:
698
+
csmul=1.35
699
+
elifcsandcs>65536:
700
+
csmul=1.30
701
+
elifcsandcs>32768:
702
+
csmul=1.25
703
+
elifcsandcs>16384:
704
+
csmul=1.20
705
+
elifcsandcs>8192:
706
+
csmul=1.15
612
707
elifcsandcs>4096:
613
-
csmul=1.2
708
+
csmul=1.10
614
709
elifcsandcs>2048:
615
-
csmul=1.1
616
-
ifmem<fsize*1.6*csmul:
710
+
csmul=1.05
711
+
712
+
layer_offset=0
713
+
714
+
print("***")
715
+
print(f"Model size: {fsize} B ; GPU VRAM: {gpumem} B ; GPU simulated VRAM: {gpu_smem} B")
716
+
print(f"Reserved VRAM {reserved_mem} B ; GPU usable VRAM {mem} B")
"Launch Browser": [launchbrowser, "Launches your default browser after model loading is complete"],
2537
-
"Disable MMAP": [disablemmap, "Avoids using mmap to load models if enabled"],
2680
+
"Disable MMAP": [disablemmap, "Avoids using mmap to load models if enabled"],
2538
2681
"Use ContextShift": [contextshift, "Uses Context Shifting to reduce reprocessing.\nRecommended. Check the wiki for more info."],
2539
-
"Remote Tunnel": [remotetunnel, "Creates a trycloudflare tunnel.\nAllows you to access koboldcpp from other devices over an internet URL."],
2682
+
"Use SmartContext": [smartcontext, "Use Smart Context. Now considered outdated and not recommended, except for KVQ with FA.\nCheck the wiki for more info."],
2683
+
"Remote Tunnel": [remotetunnel, "Creates a trycloudflare tunnel.\nAllows you to access koboldcpp from other devices over an internet URL."],
2540
2684
"Use FlashAttention": [flashattention, "Enable flash attention for GGUF models."],
2541
-
"Quiet Mode": [quietmode, "Prevents all generation related terminal output from being displayed."]
2685
+
"Quiet Mode": [quietmode, "Prevents all generation related terminal output from being displayed."],
2686
+
"High Priority - disabled by default": [highpriority, "Increases the koboldcpp process priority.\nMay cause lag or slowdown instead. Not recommended."]
lowvram_box=makecheckbox(hardware_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0, tooltiptxt='Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a speed loss.')
2586
2731
mmq_box=makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.")
2587
2732
splitmode_box=makecheckbox(hardware_tab, "Row-Split", rowsplit_var, 5,0, tooltiptxt="Split rows across GPUs instead of splitting layers and KV across GPUs.\nUses the main GPU for small tensors and intermediate results. Speed may vary.")
2733
+
dedicated_box=makecheckbox(hardware_tab, "Dedicated GPU for LLM", rowsplit_var, 5,1, tooltiptxt="Reduces the reserved area of the GPU layers autoloader from 1.125GB to 0.5GB.")
2588
2734
2589
2735
# threads
2590
2736
makelabelentry(hardware_tab, "Threads:" , threads_var, 11, 50,tooltip="How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.")
@@ -2595,6 +2741,7 @@ def changerunmode(a,b,c):
2595
2741
"High Priority": [highpriority, "Increases the koboldcpp process priority.\nMay cause lag or slowdown instead. Not recommended."],
2596
2742
"Disable MMAP": [disablemmap, "Avoids using mmap to load models if enabled"],
2597
2743
"Use mlock": [usemlock, "Enables mlock, preventing the RAM used to load the model from being paged out."],
2744
+
# "Direct I/O": [usedirect_io, "Enables Direct I/O, accelerating the model loading time"],
2598
2745
"Debug Mode": [debugmode, "Enables debug mode, with extra info printed to the terminal."],
2599
2746
"Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."]
parser.add_argument("--host", metavar=('[ipaddr]'), help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="")
4165
4313
parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true')
4166
4314
parser.add_argument("--config", metavar=('[filename]'), help="Load settings from a .kcpps file. Other arguments will be ignored", type=str, nargs=1)
4167
-
4315
+
parser.add_argument("--dedicated", help="Reduces the reserved area of the GPU layers autoloader from 1.125GB to 0.5GB.", action='store_true')
4168
4316
parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
4169
4317
compatgroup=parser.add_mutually_exclusive_group()
4170
4318
compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration (NVIDIA Geforce RTX cards, GTX 1xxx and 9xx are also compatible to some extend). Requires CUDA. Select lowvram to not allocate the context KV cache in VRAM, but instead in RAM. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit'])
0 commit comments