@@ -15,14 +15,24 @@
 )
 
 import time
+import torch
 
 # Initialize model and cache
 
-model_directory = "/mnt/str/models/mistral-7b-instruct-exl2/4.0bpw/"
+# model_directory = "/mnt/str/models/mistral-7b-instruct-exl2/4.0bpw/"
+# model_directory = "/mnt/str/models/mistral-7b-instruct"
+# model_directory = "/mnt/str/models/starcoder2-7b"
+model_directory = "/mnt/str/models/command-r-exl2/6.0bpw"
+# model_directory = "/mnt/str/models/command-r"
+
+torch.set_printoptions(precision = 5, sci_mode = False)
 
 config = ExLlamaV2Config()
 config.model_dir = model_directory
 config.prepare()
+# config.load_in_q4 = True
+config.max_seq_len = 300
+config.no_flash_attn = True
 
 model = ExLlamaV2(config)
 print("Loading model: " + model_directory)
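The new config lines read like bring-up/debugging settings, presumably for the Command-R support this commit targets: a short max_seq_len, flash attention disabled so the fallback attention path is exercised, and fixed-point tensor printing for eyeballing logits. A minimal standalone sketch of what the torch.set_printoptions call changes (not part of this commit):

import torch

# Fixed-point printing with 5 decimals instead of scientific notation,
# which makes small logit/probability values easier to compare by eye.
torch.set_printoptions(precision = 5, sci_mode = False)

print(torch.tensor([1.2345678e-4, 3.4567890e-2]))
# tensor([0.00012, 0.03457])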
@@ -39,25 +49,25 @@
 # Settings
 
 settings = ExLlamaV2Sampler.Settings()
-settings.temperature = 0.85
-settings.top_k = 50
+settings.temperature = 1.0
+settings.top_k = 0
 settings.top_p = 0.8
 settings.top_a = 0.0
-settings.token_repetition_penalty = 1.05
+settings.token_repetition_penalty = 1.02
 settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
 
 max_new_tokens = 250
 
 # Prompt
 
-prompt = "Our story begins in the Scottish town of Auchtermuchty, where once"
+prompt = "Once upon a time,"
 
 input_ids = tokenizer.encode(prompt, add_bos = True)
 prompt_tokens = input_ids.shape[-1]
 
 # Make sure CUDA is initialized so we can measure performance
 
-generator.warmup()
+# generator.warmup()
 
 # Send prompt to generator to begin stream
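With top_k = 0 (which conventionally disables top-K filtering) and temperature = 1.0 (no rescaling of the distribution), top_p = 0.8 becomes the only real truncation, backed by a mild 1.02 repetition penalty; banning the EOS token forces the full 250 new tokens, and commenting out generator.warmup() suggests this pass is about correctness rather than timing. For illustration only (this is a generic sketch, not ExLlamaV2's actual sampler code), top-p/nucleus sampling can be written as:

import torch

def sample_top_p(logits: torch.Tensor, top_p: float = 0.8) -> int:
    # Keep the smallest set of top tokens whose cumulative probability
    # reaches top_p, zero out the rest, renormalize, then sample.
    probs = torch.softmax(logits, dim = -1)
    sorted_probs, sorted_ids = torch.sort(probs, descending = True)
    cumulative = torch.cumsum(sorted_probs, dim = -1)
    # Drop tokens whose preceding cumulative mass already exceeds top_p;
    # the highest-probability token is always kept.
    sorted_probs[cumulative - sorted_probs > top_p] = 0.0
    sorted_probs /= sorted_probs.sum()
    return sorted_ids[torch.multinomial(sorted_probs, 1)].item()

logits = torch.randn(32000)  # hypothetical vocabulary size
print(sample_top_p(logits))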