1
+ import sys , os
2
+ sys .path .append (os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))))
3
+
4
+ from exllamav2 import (
5
+ ExLlamaV2 ,
6
+ ExLlamaV2Config ,
7
+ ExLlamaV2Cache ,
8
+ ExLlamaV2Tokenizer ,
9
+ ExLlamaV2VisionTower ,
10
+ )
11
+
12
+ from exllamav2 .generator import (
13
+ ExLlamaV2DynamicGenerator ,
14
+ ExLlamaV2DynamicJob ,
15
+ ExLlamaV2Sampler ,
16
+ )
17
+
18
+ from PIL import Image
19
+ import requests
20
+
21
+ # Model used:
22
+ #
23
+ # Quantized: https://huggingface.co/turboderp/pixtral-12b-exl2
24
+ # Unquantized: https://huggingface.co/mistral-community/pixtral-12b/
25
+
26
# Model/config setup, weight loading, and generator construction.
model_directory = "/mnt/str/models/pixtral-12b-exl2/6.0bpw"
config = ExLlamaV2Config(model_directory)
config.max_seq_len = 16384  # default is 1M

# Load vision model and multimodal projector and initialize preprocessor

vision_model = ExLlamaV2VisionTower(config)
vision_model.load(progress = True)

# Load EXL2 model

model = ExLlamaV2(config)
# Reuse config.max_seq_len rather than repeating the literal 16384, so changing
# the limit above keeps the cache size in sync.
cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = config.max_seq_len)
model.load_autosplit(cache, progress = True)
tokenizer = ExLlamaV2Tokenizer(config)

# Create generator

generator = ExLlamaV2DynamicGenerator(
    model = model,
    cache = cache,
    tokenizer = tokenizer,
)
49
+
50
+ # Util function to get a PIL image from a URL or from a file in the script's directory
51
+
52
def get_image(file = None, url = None):
    """Return a PIL image loaded from a file in the script's directory or from a URL.

    Exactly one of `file` or `url` must be given; raises ValueError otherwise
    (a raise, not an assert, so validation survives `python -O`).
    """
    if bool(file) == bool(url):
        raise ValueError("Specify exactly one of 'file' or 'url'")
    if file:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(script_dir, file)
        return Image.open(file_path)
    # Download the full body and wrap it in BytesIO: Response.raw does not decode
    # Content-Encoding (e.g. gzip), which can hand PIL undecodable bytes. Also
    # bound the request with a timeout and fail loudly on HTTP errors.
    from io import BytesIO
    response = requests.get(url, timeout = 30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))
60
+
61
# Convert image(s) to embeddings

# Each entry pairs a text alias (placeholder in the prompt) with its image.
image_sources = [
    ("{{IMAGE_1}}", get_image(file = "test_image_1.jpg")),
    ("{{IMAGE_2}}", get_image(file = "test_image_2.jpg")),
    # ("{{IMAGE_3}}", get_image(url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRSERy82bn3jpYKr1cNxMLXTyEsVvSt2wZOIQ&s")),
]

image_embeddings = []
for alias, img in image_sources:
    image_embeddings.append(
        vision_model.get_image_embeddings(
            model = model,
            tokenizer = tokenizer,
            image = img,
            text_alias = alias,
        )
    )
76
+
77
# Define a prompt using the aliases above as placeholders for image tokens. The tokenizer will replace each alias
# with a range of temporary token IDs, and the model will embed those temporary IDs from their respective sources
# rather than the model's text embedding table.
#
# The temporary IDs are unique for the lifetime of the process and persist as long as a reference is held to the
# corresponding ExLlamaV2Embedding object. This way, images can be reused between generations, or reused across
# multiple jobs in a batch, and the generator will be able to apply prompt caching and deduplication to image
# tokens as well as text tokens.
#
# Image token IDs are assigned sequentially, however, so two ExLlamaV2Embedding objects created from the same
# source image will not be recognized as the same image for purposes of prompt caching etc.

prompt = (
    "[INST]{{IMAGE_1}}{{IMAGE_2}}\n "
    "What are the similarities and differences between these two experiments?[/INST]"
)

# Generate

streaming = True
greedy = True
96
+
97
if streaming:

    # Streaming path: encode the prompt manually (passing the image embeddings so
    # the aliases are replaced by their temporary token IDs), enqueue a job, and
    # print tokens as they arrive.
    input_ids = tokenizer.encode(
        prompt,
        add_bos = True,
        encode_special_tokens = True,
        embeddings = image_embeddings,
    )

    job = ExLlamaV2DynamicJob(
        input_ids = input_ids,
        max_new_tokens = 500,
        decode_special_tokens = True,
        stop_conditions = [tokenizer.eos_token_id],
        gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
        embeddings = image_embeddings,
    )

    generator.enqueue(job)

    print()
    print(prompt, end = "", flush = True)

    # Iterate until the job finishes, echoing each text chunk immediately.
    # (Removed the unused `eos` flag from the original loop.)
    while generator.num_remaining_jobs():
        results = generator.iterate()
        for result in results:
            text = result.get("text", "")
            print(text, end = "", flush = True)

    print()

else:

    # Non-streaming path: one-shot generation; the embeddings are passed the same
    # way and the full completion is returned as a single string.
    output = generator.generate(
        prompt = prompt,
        max_new_tokens = 500,
        add_bos = True,
        encode_special_tokens = True,
        decode_special_tokens = True,
        stop_conditions = [tokenizer.eos_token_id],
        gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
        embeddings = image_embeddings,
    )

    print(output)
0 commit comments