@@ -15,14 +15,24 @@
 )
 
 import time
+import torch
 
 # Initialize model and cache
 
-model_directory = "/mnt/str/models/mistral-7b-instruct-exl2/4.0bpw/"
+# model_directory = "/mnt/str/models/mistral-7b-instruct-exl2/4.0bpw/"
+# model_directory = "/mnt/str/models/mistral-7b-instruct"
+# model_directory = "/mnt/str/models/starcoder2-7b"
+model_directory = "/mnt/str/models/command-r-exl2/6.0bpw"
+# model_directory = "/mnt/str/models/command-r"
+
+torch.set_printoptions(precision = 5, sci_mode = False)
 
 config = ExLlamaV2Config()
 config.model_dir = model_directory
 config.prepare()
+# config.load_in_q4 = True
+config.max_seq_len = 300
+config.no_flash_attn = True
 
 model = ExLlamaV2(config)
 print("Loading model: " + model_directory)
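The new config lines read like bring-up/debugging settings, presumably for the Command-R support this commit targets: a short max_seq_len, flash attention disabled so the fallback attention path is exercised, and fixed-point tensor printing for eyeballing logits. A minimal standalone sketch of what the torch.set_printoptions call changes (not part of this commit):

import torch

# Fixed-point printing with 5 decimals instead of scientific notation,
# which makes small logit/probability values easier to compare by eye.
torch.set_printoptions(precision = 5, sci_mode = False)

print(torch.tensor([1.2345678e-4, 3.4567890e-2]))
# tensor([0.00012, 0.03457])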
@@ -39,25 +49,25 @@
 # Settings
 
 settings = ExLlamaV2Sampler.Settings()
-settings.temperature = 0.85
-settings.top_k = 50
+settings.temperature = 1.0
+settings.top_k = 0
 settings.top_p = 0.8
 settings.top_a = 0.0
-settings.token_repetition_penalty = 1.05
+settings.token_repetition_penalty = 1.02
 settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
 
 max_new_tokens = 250
 
 # Prompt
 
-prompt = "Our story begins in the Scottish town of Auchtermuchty, where once"
+prompt = "Once upon a time,"
 
 input_ids = tokenizer.encode(prompt, add_bos = True)
 prompt_tokens = input_ids.shape[-1]
 
 # Make sure CUDA is initialized so we can measure performance
 
-generator.warmup()
+# generator.warmup()
 
 # Send prompt to generator to begin stream
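With top_k = 0 (which conventionally disables top-K filtering) and temperature = 1.0 (no rescaling of the distribution), top_p = 0.8 becomes the only real truncation, backed by a mild 1.02 repetition penalty; banning the EOS token forces the full 250 new tokens, and commenting out generator.warmup() suggests this pass is about correctness rather than timing. For illustration only (this is a generic sketch, not ExLlamaV2's actual sampler code), top-p/nucleus sampling can be written as:

import torch

def sample_top_p(logits: torch.Tensor, top_p: float = 0.8) -> int:
    # Keep the smallest set of top tokens whose cumulative probability
    # reaches top_p, zero out the rest, renormalize, then sample.
    probs = torch.softmax(logits, dim = -1)
    sorted_probs, sorted_ids = torch.sort(probs, descending = True)
    cumulative = torch.cumsum(sorted_probs, dim = -1)
    # Drop tokens whose preceding cumulative mass already exceeds top_p;
    # the highest-probability token is always kept.
    sorted_probs[cumulative - sorted_probs > top_p] = 0.0
    sorted_probs /= sorted_probs.sum()
    return sorted_ids[torch.multinomial(sorted_probs, 1)].item()

logits = torch.randn(32000)  # hypothetical vocabulary size
print(sample_top_p(logits))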