File tree 2 files changed +19
-16
lines changed
2 files changed +19
-16
lines changed Original file line number Diff line number Diff line change @@ -51,20 +51,21 @@ def from_pretrained(self, path_to_model):
51
51
52
52
model = ExLlamaV2 (config )
53
53
54
- if shared .args .cache_8bit :
55
- cache = ExLlamaV2Cache_8bit (model , lazy = True )
56
- else :
57
- cache = ExLlamaV2Cache (model , lazy = True )
58
-
59
- if shared .args .autosplit :
60
- model .load_autosplit (cache )
61
- else :
54
+ if not shared .args .autosplit :
62
55
split = None
63
56
if shared .args .gpu_split :
64
57
split = [float (alloc ) for alloc in shared .args .gpu_split .split ("," )]
65
58
66
59
model .load (split )
67
60
61
+ if shared .args .cache_8bit :
62
+ cache = ExLlamaV2Cache_8bit (model , lazy = shared .args .autosplit )
63
+ else :
64
+ cache = ExLlamaV2Cache (model , lazy = shared .args .autosplit )
65
+
66
+ if shared .args .autosplit :
67
+ model .load_autosplit (cache )
68
+
68
69
tokenizer = ExLlamaV2Tokenizer (config )
69
70
generator = ExLlamaV2StreamingGenerator (model , cache , tokenizer )
70
71
Original file line number Diff line number Diff line change @@ -36,24 +36,26 @@ class Exllamav2HF(PreTrainedModel):
36
36
def __init__ (self , config : ExLlamaV2Config ):
37
37
super ().__init__ (PretrainedConfig ())
38
38
self .ex_config = config
39
- self .ex_model = ExLlamaV2 (config )
40
39
self .loras = None
41
40
self .generation_config = GenerationConfig ()
42
41
43
- if shared .args .cache_8bit :
44
- self .ex_cache = ExLlamaV2Cache_8bit (self .ex_model , lazy = True )
45
- else :
46
- self .ex_cache = ExLlamaV2Cache (self .ex_model , lazy = True )
42
+ self .ex_model = ExLlamaV2 (config )
47
43
48
- if shared .args .autosplit :
49
- self .ex_model .load_autosplit (self .ex_cache )
50
- else :
44
+ if not shared .args .autosplit :
51
45
split = None
52
46
if shared .args .gpu_split :
53
47
split = [float (alloc ) for alloc in shared .args .gpu_split .split ("," )]
54
48
55
49
self .ex_model .load (split )
56
50
51
+ if shared .args .cache_8bit :
52
+ self .ex_cache = ExLlamaV2Cache_8bit (self .ex_model , lazy = shared .args .autosplit )
53
+ else :
54
+ self .ex_cache = ExLlamaV2Cache (self .ex_model , lazy = shared .args .autosplit )
55
+
56
+ if shared .args .autosplit :
57
+ self .ex_model .load_autosplit (self .ex_cache )
58
+
57
59
self .past_seq = None
58
60
if shared .args .cfg_cache :
59
61
if shared .args .cache_8bit :
You can’t perform that action at this time.
0 commit comments