@@ -15,24 +15,14 @@
 )

 import time
-import torch

 # Initialize model and cache

-# model_directory = "/mnt/str/models/mistral-7b-instruct-exl2/4.0bpw/"
-# model_directory = "/mnt/str/models/mistral-7b-instruct"
-# model_directory = "/mnt/str/models/starcoder2-7b"
-model_directory = "/mnt/str/models/command-r-exl2/6.0bpw"
-# model_directory = "/mnt/str/models/command-r"
-
-torch.set_printoptions(precision = 5, sci_mode = False)
+model_directory = "/mnt/str/models/mistral-7b-instruct-exl2/4.0bpw/"

 config = ExLlamaV2Config()
 config.model_dir = model_directory
 config.prepare()
-# config.load_in_q4 = True
-config.max_seq_len = 300
-config.no_flash_attn = True

 model = ExLlamaV2(config)
 print("Loading model: " + model_directory)
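The diff skips ahead here (old lines 39-48 are unchanged and not shown); that skipped section is where the tokenizer, cache and the generator used below come from. As a rough, illustrative sketch only -- assuming the usual exllamav2 streaming-example setup with ExLlamaV2Cache, ExLlamaV2Tokenizer and ExLlamaV2StreamingGenerator, none of which appear in this diff -- the surrounding code would look something like:

# Illustrative sketch only -- not part of this commit. Assumes the
# standard exllamav2 example setup for the section the diff skips over.
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler

config = ExLlamaV2Config()
config.model_dir = "/mnt/str/models/mistral-7b-instruct-exl2/4.0bpw/"
config.prepare()

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy = True)   # K/V cache, allocated as the weights load
model.load_autosplit(cache)                  # load weights, splitting across available GPUs

tokenizer = ExLlamaV2Tokenizer(config)
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)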
@@ -49,24 +39,24 @@
 # Settings

 settings = ExLlamaV2Sampler.Settings()
-settings.temperature = 1.0
-settings.top_k = 0
+settings.temperature = 0.85
+settings.top_k = 50
 settings.top_p = 0.8
 settings.top_a = 0.0
-settings.token_repetition_penalty = 1.02
+settings.token_repetition_penalty = 1.05
 settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])

 max_new_tokens = 250

 # Prompt

-prompt = "Once upon a time,"
+prompt = "Our story begins in the Scottish town of Auchtermuchty, where once"

 input_ids = tokenizer.encode(prompt, add_bos = True)
 prompt_tokens = input_ids.shape[-1]

 # Make sure CUDA is initialized so we can measure performance

-# generator.warmup()
+generator.warmup()

 # Send prompt to generator to begin stream
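The re-enabled warmup() and the final comment lead into the streaming loop itself, which lies outside this diff. A hedged sketch of how such a loop typically consumes the settings and max_new_tokens above, using the begin_stream/stream calls from exllamav2's streaming generator (again an assumption -- the real loop in the file may differ):

# Illustrative sketch only -- not part of this commit.
import sys
import time

time_begin = time.time()

generator.set_stop_conditions([])             # no stop strings; run to the token limit
generator.begin_stream(input_ids, settings)   # prefill the prompt

generated_tokens = 0
while generated_tokens < max_new_tokens:
    chunk, eos, _ = generator.stream()        # one decoded text chunk per forward pass
    generated_tokens += 1
    sys.stdout.write(chunk)
    sys.stdout.flush()
    if eos: break                             # should not fire here: EOS is disallowed above

elapsed = time.time() - time_begin
print(f"\n\nResponse generated in {elapsed:.2f} seconds, {generated_tokens} tokens")

Note that because settings.disallow_tokens() bans the EOS token, the eos flag never triggers and every run generates the full max_new_tokens, which keeps the timing comparable across runs.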