 PROMPT_TEMPLATE = (
     "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
     "(<image>./</image>)\nWhat is in the image?<|eot_id|>"
-    "<|start_header_id|>assistant<|end_header_id|>\n\n"
-)
+    "<|start_header_id|>assistant<|end_header_id|>\n\n")
 
 IMAGE_ASSETS = [
     ImageAsset("stop_sign"),
@@ -35,20 +34,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
         stop_token_ids=[128001, 128009],  # eos_id, eot_id
     )
 
-    inputs = [
-        {
-            "prompt": PROMPT_TEMPLATE,
-            "multi_modal_data": {"image": asset.pil_image},
-        }
-        for asset in IMAGE_ASSETS
-    ]
+    inputs = [{
+        "prompt": PROMPT_TEMPLATE,
+        "multi_modal_data": {
+            "image": asset.pil_image
+        },
+    } for asset in IMAGE_ASSETS]
 
     outputs = llm.generate(
         inputs,
         sampling_params,
         lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id
-        else None,
+        if lora_id else None,
     )
     # Print the outputs.
     generated_texts: List[str] = []
@@ -59,13 +56,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
     return generated_texts
 
+
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("fully_sharded", [True, False])
 def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded):
     llm = vllm.LLM(
         MODEL_PATH,
         enable_lora=True,
-        max_num_seqs=16,
+        max_num_seqs=2,
         max_loras=4,
         max_lora_rank=64,
         tensor_parallel_size=2,
@@ -85,7 +83,7 @@ def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded):
     llm = vllm.LLM(
         MODEL_PATH,
         enable_lora=True,
-        max_num_seqs=16,
+        max_num_seqs=2,
         max_loras=4,
         max_lora_rank=64,
         tensor_parallel_size=4,
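
For readers skimming the hunks: the code under test drives vLLM's multimodal generate() call with an optional per-request LoRA adapter, and the diff reformats that call while tightening max_num_seqs. A standalone sketch of the same pattern follows; the model path, adapter path, trust_remote_code flag, and sampling budget are illustrative assumptions, not values confirmed by this diff.

from typing import List

import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest

# Assumed values (placeholders, not taken from this diff):
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"  # assumed MiniCPM-V checkpoint
LORA_PATH = "/path/to/minicpmv-lora"  # placeholder adapter directory

PROMPT_TEMPLATE = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "(<image>./</image>)\nWhat is in the image?<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n")

IMAGE_ASSETS = [
    ImageAsset("stop_sign"),  # the only asset visible in this diff
]


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    sampling_params = vllm.SamplingParams(
        temperature=0,
        max_tokens=5,  # illustrative budget; the test's value is not in view
        stop_token_ids=[128001, 128009],  # eos_id, eot_id
    )

    # One request per image, all sharing the same chat-formatted prompt.
    inputs = [{
        "prompt": PROMPT_TEMPLATE,
        "multi_modal_data": {
            "image": asset.pil_image
        },
    } for asset in IMAGE_ASSETS]

    outputs = llm.generate(
        inputs,
        sampling_params,
        # lora_id 0 is reserved for the base model, so fall back to None.
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None,
    )
    return [output.outputs[0].text.strip() for output in outputs]


if __name__ == "__main__":
    llm = vllm.LLM(
        MODEL_PATH,
        enable_lora=True,
        max_num_seqs=2,  # matches the post-diff test settings
        max_loras=4,
        max_lora_rank=64,
        trust_remote_code=True,  # assumption: MiniCPM-V ships custom code
    )
    for text in do_sample(llm, LORA_PATH, lora_id=1):
        print(f"Generated text: {text!r}")

The `if lora_id else None` guard that the diff collapses onto one line reflects vLLM's convention that LoRA integer id 0 denotes the base model, so passing 0 disables the adapter for that call.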