From d41f4c5b92f9b72e9bf8ad92204f60216160f4ad Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 24 Jul 2024 22:30:36 -0700 Subject: [PATCH 01/36] wip --- vllm/config.py | 2 ++ vllm/core/scheduler.py | 51 ++++++++++++++++++++++--------------- vllm/envs.py | 6 +++++ vllm/executor/ray_utils.py | 4 +-- vllm/sequence.py | 23 +++++++++++++++++ vllm/worker/model_runner.py | 2 ++ vllm/worker/worker.py | 43 +++++++++++++++++++++++++++++-- 7 files changed, 106 insertions(+), 25 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 9d60f075792..29d63c5cff7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -5,6 +5,7 @@ import torch from transformers import PretrainedConfig +import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -811,6 +812,7 @@ def __init__(self, self.chunked_prefill_enabled = enable_chunked_prefill self.embedding_mode = embedding_mode self.preemption_mode = preemption_mode + self._use_delta = envs.VLLM_USE_DELTA_INPUT self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 6e59c5e0f74..39675656712 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -13,7 +13,8 @@ from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, - SequenceGroupMetadata, SequenceStatus) + SequenceGroupMetadata, SequenceStatus, + SequenceGroupMetadataDecode) logger = init_logger(__name__) @@ -1018,26 +1019,34 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. is_prompt = seq_group.is_prefill() - seq_group_metadata = SequenceGroupMetadata( - request_id=seq_group.request_id, - is_prompt=is_prompt, - seq_data=seq_data, - sampling_params=seq_group.sampling_params, - block_tables=block_tables, - do_sample=do_sample, - pooling_params=seq_group.pooling_params, - token_chunk_size=token_chunk_size, - lora_request=seq_group.lora_request, - computed_block_nums=common_computed_block_nums, - state=seq_group.state, - # `multi_modal_data` will only be present for the 1st comm - # between engine and worker. - # the subsequent comms can still use delta, but - # `multi_modal_data` will be None. - multi_modal_data=seq_group.multi_modal_data - if scheduler_outputs.num_prefill_groups > 0 else None, - prompt_adapter_request=seq_group.prompt_adapter_request, - ) + if is_prompt: + seq_group_metadata = SequenceGroupMetadata( + request_id=seq_group.request_id, + is_prompt=is_prompt, + seq_data=seq_data, + sampling_params=seq_group.sampling_params, + block_tables=block_tables, + do_sample=do_sample, + pooling_params=seq_group.pooling_params, + token_chunk_size=token_chunk_size, + lora_request=seq_group.lora_request, + computed_block_nums=common_computed_block_nums, + state=seq_group.state, + # `multi_modal_data` will only be present for the 1st comm + # between engine and worker. + # the subsequent comms can still use delta, but + # `multi_modal_data` will be None. 
+ multi_modal_data=seq_group.multi_modal_data + if scheduler_outputs.num_prefill_groups > 0 else None, + prompt_adapter_request=seq_group.prompt_adapter_request, + ) + else: + seq_group_metadata = SequenceGroupMetadataDecode( + seq_group.request_id, + block_tables, + do_sample=do_sample, + token_chunk_size=token_chunk_size, + ) seq_group_metadata_list.append(seq_group_metadata) # Now that the batch has been created, we can assume all blocks in the diff --git a/vllm/envs.py b/vllm/envs.py index 595992e51db..6cca09847d7 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -35,6 +35,7 @@ VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 VLLM_USE_RAY_SPMD_WORKER: bool = False + VLLM_USE_DELTA_INPUT: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") @@ -269,6 +270,11 @@ def get_default_config_root(): "VLLM_USE_RAY_SPMD_WORKER": lambda: bool(os.getenv("VLLM_USE_RAY_SPMD_WORKER", 0)), + # When enabled, vLLM sends delta input to workers instead of + # an entire data. THIS ENV VAR WILL BE REMOVED SOON. + "VLLM_USE_DELTA_INPUT": + lambda: bool(os.getenv("VLLM_USE_DELTA_INPUT", 0)), + # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index fcbfa30d7a3..24bcbbaeb08 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,8 +1,8 @@ -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest +from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.utils import get_ip, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase diff --git a/vllm/sequence.py b/vllm/sequence.py index 0cd4c7e71d7..810af450207 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -621,6 +621,22 @@ def __repr__(self) -> str: f"num_seqs={len(self.seqs_dict)})") +class SequenceGroupMetadataDecode: + """Delta sequence group metadata.""" + + def __init__( + self, + request_id: str, + block_tables: Dict[int, List[int]], + do_sample: bool = True, + token_chunk_size: Optional[int] = None, + ) -> None: + self.request_id = request_id + self.block_tables = block_tables + self.token_chunk_size = token_chunk_size + self.do_sample = do_sample + + class SequenceGroupMetadata: """Metadata for a sequence group. Used to create `AttentionMetadata`. @@ -719,6 +735,13 @@ def token_chunk_size(self) -> int: assert self._token_chunk_size is not None return self._token_chunk_size + def apply_delta( + self, sequence_group_metadata_decode: SequenceGroupMetadataDecode): + self.request_id = sequence_group_metadata_decode.request_id + self.block_tables = sequence_group_metadata_decode.block_tables + self._token_chunk_size = sequence_group_metadata_decode.token_chunk_size + self.do_sample = sequence_group_metadata_decode.do_sample + class SequenceOutput: """The model output associated with a sequence. 
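The delta path above splits request metadata in two: the scheduler ships a full SequenceGroupMetadata for a request once, and subsequent decode steps ship only the much smaller SequenceGroupMetadataDecode, which the receiving side folds back into a cached copy via apply_delta() (the worker-side cache shows up in the next diff as _get_cached_seq_group_metadata). A rough, self-contained sketch of that contract follows; the classes here are simplified stand-ins, not the real vLLM types:

    # Simplified stand-ins that mirror the shape of SequenceGroupMetadata
    # and SequenceGroupMetadataDecode from this patch.
    class FullMetadata:
        def __init__(self, request_id, block_tables, token_chunk_size):
            self.request_id = request_id
            self.block_tables = block_tables
            self.token_chunk_size = token_chunk_size
            self.is_prompt = True

        def apply_delta(self, delta):
            # Only the fields that change between decode steps are refreshed.
            self.block_tables = delta.block_tables
            self.token_chunk_size = delta.token_chunk_size
            self.do_sample = delta.do_sample
            self.is_prompt = False

    class DeltaMetadata:
        def __init__(self, request_id, block_tables, do_sample=True,
                     token_chunk_size=1):
            self.request_id = request_id
            self.block_tables = block_tables
            self.do_sample = do_sample
            self.token_chunk_size = token_chunk_size

    _cache = {}

    def receive(metadata_or_delta):
        # The first message for a request carries the full metadata; every
        # later message is a small delta folded into the cached copy.
        request_id = metadata_or_delta.request_id
        if request_id not in _cache:
            _cache[request_id] = metadata_or_delta
        else:
            _cache[request_id].apply_delta(metadata_or_delta)
        return _cache[request_id]

    first = receive(FullMetadata("req-0", {0: [1, 2]}, token_chunk_size=8))
    later = receive(DeltaMetadata("req-0", {0: [1, 2, 3]}))
    assert later is first and later.token_chunk_size == 1 and not later.is_prompt

The payoff is that per-step traffic stays proportional to what actually changed (block tables, chunk size, sampling flag) instead of re-sending prompt tokens and sampling parameters on every decode step.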
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 31e9fc1eed5..a1ec439d44c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -545,6 +545,8 @@ def __init__( self.flashinfer_prefill_workspace_buffer = None self.flashinfer_prefill_wrapper = None + self.seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} + set_cpu_offload_max_bytes( int(self.cache_config.cpu_offload_gb * 1024**3)) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index f3c379d1aa3..e2470cf16f3 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,7 +1,7 @@ """A GPU worker class.""" import gc import os -from typing import List, Optional, Set, Tuple, Type +from typing import List, Optional, Set, Tuple, Type, Dict import torch import torch.distributed @@ -18,7 +18,7 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest +from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata, SequenceGroupMetadataDecode from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -31,6 +31,8 @@ class Worker(LocalOrDistributedWorkerBase): Each worker is associated with a single GPU. The worker is responsible for maintaining the KV cache and executing the model on the GPU. In case of distributed inference, each worker is assigned a partition of the model. + + The worker manages the state of SequenceGroupMetadata. """ def __init__( @@ -106,6 +108,7 @@ def __init__( self.cache_engine: List[CacheEngine] # Initialize gpu_cache as embedding models don't initialize kv_caches self.gpu_cache: Optional[List[List[torch.Tensor]]] = None + self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} def init_device(self) -> None: if self.device_config.device.type == "cuda": @@ -290,6 +293,42 @@ def execute_worker(self, worker_input: WorkerInput) -> None: and worker_input.blocks_to_copy.numel() > 0): self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) + def _get_cached_seq_group_metadata(self, seq_group_metadata_list): + """In-place update execute_model_req based on a cached """ + new_seq_group_metadata_list = [] + for metadata_or_delta in seq_group_metadata_list: + request_id = metadata_or_delta.request_id + if request_id not in self._seq_group_metadata_cache: + assert isinstance(metadata_or_delta, SequenceGroupMetadata) + self._seq_group_metadata_cache[request_id] = metadata_or_delta + else: + assert isinstance(metadata_or_delta, + SequenceGroupMetadataDecode) + self._seq_group_metadata_cache[request_id].apply_delta( + metadata_or_delta) + new_seq_group_metadata_list.append( + self._seq_group_metadata_cache[request_id]) + return new_seq_group_metadata_list + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> Optional[List[SamplerOutput]]: + if execute_model_req is not None: + new_seq_group_metadata_list = self._get_cached_seq_group_metadata( + execute_model_req.seq_group_metadata_list) + execute_model_req.seq_group_metadata_list = new_seq_group_metadata_list + return super().execute_model(execute_model_req) + + def _execute_model_spmd( + self, execute_model_req: ExecuteModelRequest + ) -> Optional[List[SamplerOutput]]: + if execute_model_req is not None: + 
new_seq_group_metadata_list = self._get_cached_seq_group_metadata( + execute_model_req.seq_group_metadata_list) + execute_model_req.seq_group_metadata_list = new_seq_group_metadata_list + return super()._execute_model_spmd(execute_model_req) + def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) From 5741a83a30eff9f401f60b7985cecdcf97a67a63 Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 24 Jul 2024 22:57:42 -0700 Subject: [PATCH 02/36] fix original arch issue --- tests/prompts/example.txt | 9 +-------- vllm/sequence.py | 4 +++- vllm/worker/worker.py | 2 ++ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/prompts/example.txt b/tests/prompts/example.txt index e1b97bc6eee..6e8c45b673e 100644 --- a/tests/prompts/example.txt +++ b/tests/prompts/example.txt @@ -1,8 +1 @@ -vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. -Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. -Compare and contrast artificial intelligence with human intelligence in terms of processing information. -Describe the basic components of a neural network and how it can be trained. -Write a short story about a robot that dreams for the first time. -Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. -Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. -Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' +vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. \ No newline at end of file diff --git a/vllm/sequence.py b/vllm/sequence.py index 810af450207..0decfce7183 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -229,7 +229,8 @@ def __repr__(self) -> str: return (f"SequenceData(" f"prompt_token_ids={self._prompt_token_ids}, " f"output_token_ids={self._output_token_ids}, " - f"cumulative_logprob={self.cumulative_logprob})") + f"cumulative_logprob={self.cumulative_logprob}, " + f"get_num_computed_tokens={self.get_num_computed_tokens()}") class Sequence: @@ -741,6 +742,7 @@ def apply_delta( self.block_tables = sequence_group_metadata_decode.block_tables self._token_chunk_size = sequence_group_metadata_decode.token_chunk_size self.do_sample = sequence_group_metadata_decode.do_sample + self.is_prompt = False class SequenceOutput: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index e2470cf16f3..3940c76fb7e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -314,6 +314,8 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> Optional[List[SamplerOutput]]: + # breakpoint() + # print(execute_model_req.seq_group_metadata_list[0].seq_data) if execute_model_req is not None: new_seq_group_metadata_list = self._get_cached_seq_group_metadata( execute_model_req.seq_group_metadata_list) From d31d73f940eb73a20ba1d4c19460b991beaf920d Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 24 Jul 2024 23:33:27 -0700 Subject: [PATCH 03/36] should work now. 
--- .../test_basic_distributed_correctness.py | 2 +- vllm/config.py | 2 +- vllm/core/scheduler.py | 9 +++-- vllm/envs.py | 4 +-- vllm/executor/ray_utils.py | 4 +-- vllm/sequence.py | 33 +++++++++++++++++++ vllm/worker/worker.py | 19 ++++++----- 7 files changed, 57 insertions(+), 16 deletions(-) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 7a0e5673b2c..7f2ad6cc728 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -47,7 +47,7 @@ def test_models( # will hurt multiprocessing backend with fork method (the default method). with vllm_runner(model, dtype=dtype, - tensor_parallel_size=2, + tensor_parallel_size=1, distributed_executor_backend=distributed_executor_backend ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/vllm/config.py b/vllm/config.py index 29d63c5cff7..b43d5a05372 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -5,8 +5,8 @@ import torch from transformers import PretrainedConfig -import vllm.envs as envs +import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 39675656712..81e35f3f1b5 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -13,8 +13,8 @@ from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, - SequenceGroupMetadata, SequenceStatus, - SequenceGroupMetadataDecode) + SequenceGroupMetadata, SequenceGroupMetadataDecode, + SequenceStatus) logger = init_logger(__name__) @@ -1041,7 +1041,12 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: prompt_adapter_request=seq_group.prompt_adapter_request, ) else: + seq_data_delta = {} + for id, data in seq_data.items(): + seq_data_delta[id] = data.get_delta() + seq_group_metadata = SequenceGroupMetadataDecode( + seq_data_delta, seq_group.request_id, block_tables, do_sample=do_sample, diff --git a/vllm/envs.py b/vllm/envs.py index 6cca09847d7..65b3fe43411 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -268,7 +268,7 @@ def get_default_config_root(): # execution on all workers. # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it. "VLLM_USE_RAY_SPMD_WORKER": - lambda: bool(os.getenv("VLLM_USE_RAY_SPMD_WORKER", 0)), + lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", 0))), # When enabled, vLLM sends delta input to workers instead of # an entire data. THIS ENV VAR WILL BE REMOVED SOON. @@ -279,7 +279,7 @@ def get_default_config_root(): # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. "VLLM_USE_RAY_COMPILED_DAG": - lambda: bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)), + lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0))), # Use dedicated multiprocess context for workers. 
# Both spawn and fork work diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 24bcbbaeb08..fcbfa30d7a3 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,8 +1,8 @@ -from typing import List, Optional, Tuple, Dict +from typing import List, Optional, Tuple from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest from vllm.utils import get_ip, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase diff --git a/vllm/sequence.py b/vllm/sequence.py index 0decfce7183..71eda7403c5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -100,6 +100,16 @@ class RequestMetrics: finished_time: Optional[float] = None +class SequenceDataDelta: + + def __init__(self, new_output_token_ids, new_cumulative_logprob, + new_num_computed_tokens, new_stage): + self.new_output_token_ids = new_output_token_ids + self.new_cumulative_logprob = new_cumulative_logprob + self.new_num_computed_tokens = new_num_computed_tokens + self.new_stage = new_stage + + class SequenceData: """Data associated with a sequence. @@ -129,6 +139,9 @@ def __init__( self._num_computed_tokens = 0 self._stage: SequenceStage = SequenceStage.PREFILL + # New output tokens appended. Used to get delta input. + self._new_appended_tokens: List[int] = [] + self._update_cached_all_tokens() def _update_cached_all_tokens(self): @@ -156,6 +169,7 @@ def output_token_ids(self, new_output_token_ids) -> None: def append_token_id(self, token_id: int, logprob: float) -> None: self._output_token_ids.append(token_id) + self._new_appended_tokens.append(token_id) self._cached_all_token_ids.append(token_id) self.cumulative_logprob += logprob @@ -221,6 +235,21 @@ def get_prompt_token_ids(self) -> Tuple[int, ...]: def get_output_token_ids(self) -> Tuple[int, ...]: return self.output_token_ids + def get_delta(self) -> SequenceDataDelta: + delta = SequenceDataDelta(self._new_appended_tokens, + self.cumulative_logprob, + self.get_num_computed_tokens(), self.stage) + # Reset delta state. 
+ self._new_appended_tokens = [] + return delta + + def apply_delta(self, delta: SequenceDataDelta): + self._num_computed_tokens = delta.new_num_computed_tokens + self.cumulative_logprob = delta.new_cumulative_logprob + self._stage = delta.new_stage + self._output_token_ids.extend(delta.new_output_token_ids) + self._cached_all_token_ids.extend(delta.new_output_token_ids) + @property def stage(self) -> SequenceStage: return self._stage @@ -627,11 +656,13 @@ class SequenceGroupMetadataDecode: def __init__( self, + seq_data_delta: Dict[int, SequenceDataDelta], request_id: str, block_tables: Dict[int, List[int]], do_sample: bool = True, token_chunk_size: Optional[int] = None, ) -> None: + self.seq_data_delta = seq_data_delta self.request_id = request_id self.block_tables = block_tables self.token_chunk_size = token_chunk_size @@ -738,6 +769,8 @@ def token_chunk_size(self) -> int: def apply_delta( self, sequence_group_metadata_decode: SequenceGroupMetadataDecode): + for id, delta in sequence_group_metadata_decode.seq_data_delta.items(): + self.seq_data[id].apply_delta(delta) self.request_id = sequence_group_metadata_decode.request_id self.block_tables = sequence_group_metadata_decode.block_tables self._token_chunk_size = sequence_group_metadata_decode.token_chunk_size diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 3940c76fb7e..7ae5e7fd877 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,7 +1,7 @@ """A GPU worker class.""" import gc import os -from typing import List, Optional, Set, Tuple, Type, Dict +from typing import Dict, List, Optional, Set, Tuple, Type import torch import torch.distributed @@ -18,7 +18,8 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceGroupMetadata, SequenceGroupMetadataDecode +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, + SequenceGroupMetadata, SequenceGroupMetadataDecode) from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -314,13 +315,13 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> Optional[List[SamplerOutput]]: - # breakpoint() - # print(execute_model_req.seq_group_metadata_list[0].seq_data) if execute_model_req is not None: new_seq_group_metadata_list = self._get_cached_seq_group_metadata( execute_model_req.seq_group_metadata_list) - execute_model_req.seq_group_metadata_list = new_seq_group_metadata_list - return super().execute_model(execute_model_req) + execute_model_req.seq_group_metadata_list = ( + new_seq_group_metadata_list) + output = super().execute_model(execute_model_req) + return output def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest @@ -328,8 +329,10 @@ def _execute_model_spmd( if execute_model_req is not None: new_seq_group_metadata_list = self._get_cached_seq_group_metadata( execute_model_req.seq_group_metadata_list) - execute_model_req.seq_group_metadata_list = new_seq_group_metadata_list - return super()._execute_model_spmd(execute_model_req) + execute_model_req.seq_group_metadata_list = ( + new_seq_group_metadata_list) + output = super()._execute_model_spmd(execute_model_req) + return output def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) 
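The piece of [PATCH 03/36] that keeps per-step payloads small is SequenceDataDelta: get_delta() hands back only the tokens appended since the previous call together with the updated counters and then clears that buffer, while apply_delta() on the cached copy extends its token list with them. A minimal stand-alone illustration of that round trip, using simplified types rather than the actual vLLM classes:

    class SeqData:
        def __init__(self, prompt_token_ids):
            self.prompt_token_ids = list(prompt_token_ids)
            self.output_token_ids = []
            self._new_tokens = []      # tokens appended since the last delta

        def append(self, token_id):
            self.output_token_ids.append(token_id)
            self._new_tokens.append(token_id)

        def get_delta(self):
            # Hand back only what changed, then reset the buffer.
            delta, self._new_tokens = self._new_tokens, []
            return delta

        def apply_delta(self, delta):
            self.output_token_ids.extend(delta)

    driver = SeqData([1, 2, 3])
    worker = SeqData([1, 2, 3])        # the worker keeps its own cached copy
    for token in (10, 11, 12):
        driver.append(token)
        worker.apply_delta(driver.get_delta())   # one small message per step
    assert worker.output_token_ids == driver.output_token_ids

Run over many steps, the cached copy stays in sync with the driver-side sequence while each message carries only the newly sampled tokens.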
From 36e786d66d108c3a78ec85d222b451dcc34ca4aa Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 24 Jul 2024 23:38:11 -0700 Subject: [PATCH 04/36] working --- vllm/core/scheduler.py | 4 +++- vllm/worker/worker.py | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 81e35f3f1b5..c9747b5dfdb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -6,6 +6,7 @@ from dataclasses import dataclass, field from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union +import vllm.envs as envs from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.core.policy import Policy, PolicyFactory @@ -1019,7 +1020,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. is_prompt = seq_group.is_prefill() - if is_prompt: + if is_prompt or not envs.VLLM_USE_RAY_SPMD_WORKER: seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, is_prompt=is_prompt, @@ -1041,6 +1042,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: prompt_adapter_request=seq_group.prompt_adapter_request, ) else: + # Delta is used only for spmd workers. seq_data_delta = {} for id, data in seq_data.items(): seq_data_delta[id] = data.get_delta() diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7ae5e7fd877..ac473d3791d 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -315,11 +315,6 @@ def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> Optional[List[SamplerOutput]]: - if execute_model_req is not None: - new_seq_group_metadata_list = self._get_cached_seq_group_metadata( - execute_model_req.seq_group_metadata_list) - execute_model_req.seq_group_metadata_list = ( - new_seq_group_metadata_list) output = super().execute_model(execute_model_req) return output From 71e40c16dfa7a7ec9b0b24da4b92035800a6587e Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 24 Jul 2024 23:53:33 -0700 Subject: [PATCH 05/36] . --- vllm/executor/ray_gpu_executor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e4aaeaa24c1..eff20cc4a3d 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -48,6 +48,9 @@ def _init_executor(self) -> None: assert self.use_ray_compiled_dag, ( "VLLM_USE_RAY_SPMD_WORKER=1 requires " "VLLM_USE_RAY_COMPILED_DAG=1") + assert self.parallel_config.tensor_parallel_size > 1, ( + "VLLM_USE_RAY_SPMD_WORKER=1 doesn't work with TP size 1." 
+ ) assert self.uses_ray placement_group = self.parallel_config.placement_group From 7e692424587c5695bf41d896d846beee8cda362b Mon Sep 17 00:00:00 2001 From: sang Date: Thu, 25 Jul 2024 16:08:02 -0700 Subject: [PATCH 06/36] pickle --- vllm/engine/llm_engine.py | 16 ++++++ vllm/executor/ray_gpu_executor.py | 10 ++-- vllm/executor/ray_utils.py | 7 ++- vllm/sequence.py | 82 +++++++++++++++++++++++-------- 4 files changed, 87 insertions(+), 28 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index eabe3b23a9d..b293c0033e9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -63,6 +63,9 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: return config.to_diff_dict() +last_input = None +last_output = None + _O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput) @@ -910,6 +913,19 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: finished_requests_ids=finished_requests_ids) output = self.model_executor.execute_model( execute_model_req=execute_model_req) + + # global last_input, last_output + # import pickle + # this_input = pickle.dumps(execute_model_req) + # this_output = pickle.dumps(output) + # print(f"{len(this_input)=}, {len(this_output)=}") + # from Levenshtein import distance + # if last_input is not None: + # input_distance = distance(this_input, last_input) + # output_distance = distance(this_output, last_output) + # print(f"{input_distance=}, {output_distance=}") + # last_input = this_input + # last_output = this_output else: output = [] diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index eff20cc4a3d..08a90619b1c 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -48,9 +48,6 @@ def _init_executor(self) -> None: assert self.use_ray_compiled_dag, ( "VLLM_USE_RAY_SPMD_WORKER=1 requires " "VLLM_USE_RAY_COMPILED_DAG=1") - assert self.parallel_config.tensor_parallel_size > 1, ( - "VLLM_USE_RAY_SPMD_WORKER=1 doesn't work with TP size 1." - ) assert self.uses_ray placement_group = self.parallel_config.placement_group @@ -278,8 +275,11 @@ def execute_model( if self.forward_dag is None: self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) - outputs = ray.get(self.forward_dag.execute(execute_model_req)) - return outputs[0] + import pickle + serialized_data = pickle.dumps(execute_model_req) + + outputs = ray.get(self.forward_dag.execute(serialized_data)) + return pickle.loads(outputs[0]) def _run_workers( self, diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index fcbfa30d7a3..851f45bd22f 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -31,9 +31,11 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: gpu_ids = ray.get_gpu_ids() return node_id, gpu_ids - def execute_model_spmd(self, execute_model_req: ExecuteModelRequest): + def execute_model_spmd(self, execute_model_req: bytes): """Used only when SPMD worker and compiled DAG are both enabled.""" + import pickle + execute_model_req: ExecuteModelRequest = pickle.loads(execute_model_req) # TODO(swang): This is needed right now because Ray aDAG executes # on a background thread, so we need to reset torch's current # device. 
@@ -42,7 +44,8 @@ def execute_model_spmd(self, execute_model_req: ExecuteModelRequest): torch.cuda.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True - return self.worker._execute_model_spmd(execute_model_req) + output = self.worker._execute_model_spmd(execute_model_req) + return pickle.dumps(output) ray_import_err = None diff --git a/vllm/sequence.py b/vllm/sequence.py index 71eda7403c5..1251ed61f48 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1024,30 +1024,32 @@ def prune(self, self.seq_ids = seq_ids -@dataclass class ExecuteModelRequest: """The model execution request, containing CPU metadata only. The LLM engine should create an instance of this class for each request batch.""" - # The sequence group metadata list. - seq_group_metadata_list: List[SequenceGroupMetadata] - # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: List[Tuple[int, int]] = field(default_factory=list) - # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: List[Tuple[int, int]] = field(default_factory=list) - # Blocks to copy. Source to dest block. - blocks_to_copy: List[Tuple[int, int]] = field(default_factory=list) - # Virtual engine ID for pipeline parallel. - virtual_engine: int = 0 - # The number of slots for lookahead decoding. - num_lookahead_slots: int = 0 - # The number of requests in the running queue. - running_queue_size: int = 0 - # Optional hidden states from prior step. - previous_hidden_states: Optional[HiddenStates] = None - # The number of forward steps to run. - num_steps: int = 1 - # Finished request ids since last step. - finished_requests_ids: List[str] = field(default_factory=list) + def __init__( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: List[Tuple[int, int]]=None, + blocks_to_swap_out: List[Tuple[int, int]]=None, + blocks_to_copy: List[Tuple[int, int]]=None, + virtual_engine: int = 0, + num_lookahead_slots: int = 0, + running_queue_size: int = 0, + previous_hidden_states: Optional[HiddenStates] = None, + num_steps: int = 1, + finished_requests_ids: List[str] = None + ): + self.seq_group_metadata_list = seq_group_metadata_list + self.blocks_to_swap_in = blocks_to_swap_in or [] + self.blocks_to_swap_out = blocks_to_swap_out or [] + self.blocks_to_copy = blocks_to_copy or [] + self.virtual_engine = virtual_engine + self.num_lookahead_slots = num_lookahead_slots + self.running_queue_size = running_queue_size + self.previous_hidden_states = previous_hidden_states + self.num_steps = num_steps + self.finished_requests_ids = finished_requests_ids or [] def clone( self, seq_group_metadata_list: List[SequenceGroupMetadata] @@ -1064,3 +1066,41 @@ def clone( previous_hidden_states=self.previous_hidden_states, num_steps=self.num_steps, finished_requests_ids=self.finished_requests_ids) + + + # # The sequence group metadata list. + # seq_group_metadata_list: List[SequenceGroupMetadata] + # # Blocks to swap in. List of CPU -> GPU block number. + # blocks_to_swap_in: List[Tuple[int, int]] = field(default_factory=list) + # # Blocks to swap out. List of GPU -> CPU block number. + # blocks_to_swap_out: List[Tuple[int, int]] = field(default_factory=list) + # # Blocks to copy. Source to dest block. + # blocks_to_copy: List[Tuple[int, int]] = field(default_factory=list) + # # Virtual engine ID for pipeline parallel. + # virtual_engine: int = 0 + # # The number of slots for lookahead decoding. + # num_lookahead_slots: int = 0 + # # The number of requests in the running queue. 
+ # running_queue_size: int = 0 + # # Optional hidden states from prior step. + # previous_hidden_states: Optional[HiddenStates] = None + # # The number of forward steps to run. + # num_steps: int = 1 + # # Finished request ids since last step. + # finished_requests_ids: List[str] = field(default_factory=list) + + # def clone( + # self, seq_group_metadata_list: List[SequenceGroupMetadata] + # ) -> "ExecuteModelRequest": + # """Clone the request with a new sequence group metadata list.""" + # return ExecuteModelRequest( + # seq_group_metadata_list=seq_group_metadata_list, + # blocks_to_swap_in=self.blocks_to_swap_in.copy(), + # blocks_to_swap_out=self.blocks_to_swap_out.copy(), + # blocks_to_copy=self.blocks_to_copy.copy(), + # virtual_engine=self.virtual_engine, + # num_lookahead_slots=self.num_lookahead_slots, + # running_queue_size=self.running_queue_size, + # previous_hidden_states=self.previous_hidden_states, + # num_steps=self.num_steps, + # finished_requests_ids=self.finished_requests_ids) From 0de9f2379f995b5b3877b76642a9a87ecd7f88fa Mon Sep 17 00:00:00 2001 From: sang Date: Fri, 26 Jul 2024 17:07:12 -0700 Subject: [PATCH 07/36] msgpack optimization --- requirements-common.txt | 1 + vllm/core/scheduler.py | 21 +- vllm/engine/llm_engine.py | 2 +- vllm/executor/ray_gpu_executor.py | 18 +- vllm/executor/ray_utils.py | 17 +- vllm/sampling_params.py | 121 +++++------- vllm/sequence.py | 263 +++++++++---------------- vllm/spec_decode/batch_expansion.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 5 +- vllm/worker/model_runner.py | 7 +- 10 files changed, 194 insertions(+), 263 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 29643cfce16..2fbb5977c16 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -22,3 +22,4 @@ outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 typing_extensions filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 pyzmq +msgspec \ No newline at end of file diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c9747b5dfdb..c4b26106136 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1028,18 +1028,18 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, do_sample=do_sample, - pooling_params=seq_group.pooling_params, + # pooling_params=seq_group.pooling_params, token_chunk_size=token_chunk_size, - lora_request=seq_group.lora_request, + # lora_request=seq_group.lora_request, computed_block_nums=common_computed_block_nums, - state=seq_group.state, - # `multi_modal_data` will only be present for the 1st comm - # between engine and worker. - # the subsequent comms can still use delta, but - # `multi_modal_data` will be None. - multi_modal_data=seq_group.multi_modal_data - if scheduler_outputs.num_prefill_groups > 0 else None, - prompt_adapter_request=seq_group.prompt_adapter_request, + # state=seq_group.state, + # # `multi_modal_data` will only be present for the 1st comm + # # between engine and worker. + # # the subsequent comms can still use delta, but + # # `multi_modal_data` will be None. + # multi_modal_data=seq_group.multi_modal_data + # if scheduler_outputs.num_prefill_groups > 0 else None, + # prompt_adapter_request=seq_group.prompt_adapter_request, ) else: # Delta is used only for spmd workers. 
@@ -1063,7 +1063,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: self.block_manager.mark_blocks_as_computed( scheduled_seq_group.seq_group) - return seq_group_metadata_list, scheduler_outputs def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b293c0033e9..3b6f6103fd4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -221,7 +221,6 @@ def __init__( cache_config.enable_prefix_caching, ) # TODO(woosuk): Print more configs in debug mode. - self.model_config = model_config self.cache_config = cache_config self.lora_config = lora_config @@ -899,6 +898,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: "as performance will be severely degraded otherwise.") seq_group_metadata_list, scheduler_outputs = self.scheduler[ 0].schedule() + # print("SANG-TODO batch size,", len(seq_group_metadata_list)) if not scheduler_outputs.is_empty(): finished_requests_ids = self.scheduler[ diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 08a90619b1c..9bda55b59ca 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -3,6 +3,9 @@ from collections import defaultdict from itertools import islice, repeat from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple +import time +import msgspec +import pickle import vllm.envs as envs from vllm.executor.distributed_gpu_executor import ( # yapf: disable @@ -34,6 +37,7 @@ def _init_executor(self) -> None: # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. # Currently, this requires USE_RAY_SPMD_WORKER=True. self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG + self.i = 0 # If the env var is set, then we do not distinguish between the # "driver worker" vs other workers. Also, the rank 0 worker will # be executed in a remote Ray worker. Currently this requires @@ -61,6 +65,7 @@ def _init_executor(self) -> None: self._init_workers_ray(placement_group) self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + self.encoder = msgspec.msgpack.Encoder() def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> Dict[str, Any]: @@ -275,11 +280,18 @@ def execute_model( if self.forward_dag is None: self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) - import pickle - serialized_data = pickle.dumps(execute_model_req) + s = time.time() + # serialized_data = pickle.dumps(execute_model_req) + serialized_data = self.encoder.encode(execute_model_req) + # print(f"SANG-TODO input serialization takes {(time.time() - s) * 1000} ms index: {self.i}") + import sys + # print("SANG-TODO size: ,", sys.getsizeof(serialized_data)) outputs = ray.get(self.forward_dag.execute(serialized_data)) - return pickle.loads(outputs[0]) + output = pickle.loads(outputs[0]) + # print(f"SANG-TODO e2e takes {(time.time() - s) * 1000} ms index: {self.i}") + self.i += 1 + return output def _run_workers( self, diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 851f45bd22f..1cc7f3a82d9 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,4 +1,7 @@ from typing import List, Optional, Tuple +import time +import pickle +import msgspec from vllm.config import ParallelConfig from vllm.logger import init_logger @@ -22,6 +25,8 @@ def __init__(self, *args, **kwargs) -> None: # The flag indicates is set_device is called on # that thread. 
self.compiled_dag_cuda_device_set = False + self.i = 0 + self.decoder = msgspec.msgpack.Decoder(ExecuteModelRequest) def get_node_ip(self) -> str: return get_ip() @@ -34,8 +39,10 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: def execute_model_spmd(self, execute_model_req: bytes): """Used only when SPMD worker and compiled DAG are both enabled.""" - import pickle - execute_model_req: ExecuteModelRequest = pickle.loads(execute_model_req) + s = time.time() + execute_model_req: ExecuteModelRequest = self.decoder.decode(execute_model_req) + # execute_model_req: ExecuteModelRequest = pickle.loads(execute_model_req) + # print(f"SANG-TODO input deserialization takes {(time.time() - s) * 1000} ms index: {self.i}") # TODO(swang): This is needed right now because Ray aDAG executes # on a background thread, so we need to reset torch's current # device. @@ -43,9 +50,11 @@ def execute_model_spmd(self, execute_model_req: bytes): if not self.compiled_dag_cuda_device_set: torch.cuda.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True - output = self.worker._execute_model_spmd(execute_model_req) - return pickle.dumps(output) + output = pickle.dumps(output) + # print(f"SANG-TODO worker takes {(time.time() - s) * 1000} ms index: {self.i}") + self.i += 1 + return output ray_import_err = None diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index ebe5e0fd341..f4a90dfa8c7 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -2,11 +2,12 @@ import copy from enum import IntEnum from functools import cached_property -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union, Set import torch from pydantic import Field from typing_extensions import Annotated +import msgspec import vllm.envs as envs from vllm.logger import init_logger @@ -33,7 +34,7 @@ class SamplingType(IntEnum): to sample from.""" -class SamplingParams: +class SamplingParams(msgspec.Struct, omit_defaults=True, array_like=False): """Sampling parameters for text generation. Overall, we follow the sampling parameters from the OpenAI text completion @@ -111,81 +112,63 @@ class SamplingParams: (i.e., no truncation). 
""" - def __init__( - self, - n: int = 1, - best_of: Optional[int] = None, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repetition_penalty: float = 1.0, - temperature: float = 1.0, - top_p: float = 1.0, - top_k: int = -1, - min_p: float = 0.0, - seed: Optional[int] = None, - use_beam_search: bool = False, - length_penalty: float = 1.0, - early_stopping: Union[bool, str] = False, - stop: Optional[Union[str, List[str]]] = None, - stop_token_ids: Optional[List[int]] = None, - include_stop_str_in_output: bool = False, - ignore_eos: bool = False, - max_tokens: Optional[int] = 16, - min_tokens: int = 0, - logprobs: Optional[int] = None, - prompt_logprobs: Optional[int] = None, - detokenize: bool = True, - skip_special_tokens: bool = True, - spaces_between_special_tokens: bool = True, - logits_processors: Optional[List[LogitsProcessor]] = None, - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, - ) -> None: - self.n = n - self.best_of = best_of if best_of is not None else n - self.presence_penalty = presence_penalty - self.frequency_penalty = frequency_penalty - self.repetition_penalty = repetition_penalty - self.temperature = temperature - self.top_p = top_p - self.top_k = top_k - self.min_p = min_p - if seed == -1: + n: int = 1 + best_of: Optional[int] = None + presence_penalty: float = 0.0 + frequency_penalty: float = 0.0 + repetition_penalty: float = 1.0 + temperature: float = 1.0 + top_p: float = 1.0 + top_k: int = -1 + min_p: float = 0.0 + seed: Optional[int] = None + use_beam_search: bool = False + length_penalty: float = 1.0 + early_stopping: Union[bool, str] = False + stop: Optional[Union[str, List[str]]] = None + stop_token_ids: Optional[List[int]] = None + ignore_eos: bool = False + max_tokens: Optional[int] = 16 + min_tokens: int = 0 + logprobs: Optional[int] = None + prompt_logprobs: Optional[int] = None + # NOTE: This parameter is only exposed at the engine level for now. + # It is not exposed in the OpenAI API server, as the OpenAI API does + # not support returning only a list of token IDs. + detokenize: bool = True + skip_special_tokens: bool = True + spaces_between_special_tokens: bool = True + # logits_processors: Optional[List[LogitsProcessor]] = None + logits_processors: Optional[Any] = None + include_stop_str_in_output: bool = False + truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None + + # The below fields are not supposed to be used as an input. + # They are set in post_init. + output_text_buffer_length: int = 0 + all_stop_token_ids: Optional[Set[int]] = None + + def __post_init__(self) -> None: + self.best_of = self.best_of or self.n + if self.seed == -1: self.seed = None else: - self.seed = seed - self.use_beam_search = use_beam_search - self.length_penalty = length_penalty - self.early_stopping = early_stopping - if stop is None: + self.seed = self.seed + if self.stop is None: self.stop = [] - elif isinstance(stop, str): - self.stop = [stop] + elif isinstance(self.stop, str): + self.stop = [self.stop] else: - self.stop = list(stop) - if stop_token_ids is None: + self.stop = list(self.stop) + if self.stop_token_ids is None: self.stop_token_ids = [] else: - self.stop_token_ids = list(stop_token_ids) - self.ignore_eos = ignore_eos - self.max_tokens = max_tokens - self.min_tokens = min_tokens - self.logprobs = logprobs - self.prompt_logprobs = prompt_logprobs - # NOTE: This parameter is only exposed at the engine level for now. 
- # It is not exposed in the OpenAI API server, as the OpenAI API does - # not support returning only a list of token IDs. - self.detokenize = detokenize - self.skip_special_tokens = skip_special_tokens - self.spaces_between_special_tokens = spaces_between_special_tokens - self.logits_processors = logits_processors - self.include_stop_str_in_output = include_stop_str_in_output - self.truncate_prompt_tokens = truncate_prompt_tokens + self.stop_token_ids = list(self.stop_token_ids) + # Number of characters to hold back for stop string evaluation # until sequence is finished. - if self.stop and not include_stop_str_in_output: + if self.stop and not self.include_stop_str_in_output: self.output_text_buffer_length = max(len(s) for s in self.stop) - 1 - else: - self.output_text_buffer_length = 0 self._verify_args() if self.use_beam_search: @@ -322,7 +305,7 @@ def update_from_generation_config( eos_ids.update(self.stop_token_ids) self.stop_token_ids = list(eos_ids) - @cached_property + @property def sampling_type(self) -> SamplingType: if self.use_beam_search: return SamplingType.BEAM diff --git a/vllm/sequence.py b/vllm/sequence.py index 1251ed61f48..a1aec53fb9a 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -6,10 +6,11 @@ from collections import defaultdict from dataclasses import dataclass, field from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Set, Tuple, - Union) + Union, Any) import torch +import msgspec from vllm.lora.request import LoRARequest from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -100,17 +101,14 @@ class RequestMetrics: finished_time: Optional[float] = None -class SequenceDataDelta: +class SequenceDataDelta(msgspec.Struct, array_like=True, omit_defaults=True): + new_output_token_ids: List[int] + new_cumulative_logprob: float + new_num_computed_tokens: int + new_stage: SequenceStage - def __init__(self, new_output_token_ids, new_cumulative_logprob, - new_num_computed_tokens, new_stage): - self.new_output_token_ids = new_output_token_ids - self.new_cumulative_logprob = new_cumulative_logprob - self.new_num_computed_tokens = new_num_computed_tokens - self.new_stage = new_stage - -class SequenceData: +class SequenceData(msgspec.Struct, array_like=False, omit_defaults=True): """Data associated with a sequence. Args: @@ -123,29 +121,27 @@ class SequenceData: output_token_ids: The token IDs of the output. cumulative_logprob: The cumulative log probability of the output. """ - - def __init__( + prompt_token_ids: List[int] + _output_token_ids: Optional[List[int]] = msgspec.field(default_factory=list) + cumulative_logprob: float = 0.0 + + ## The below fields should not be passed as an argument ## + _prompt_token_ids_tuple: Optional[Tuple[int, ...]] = None + # The number of tokens that are computed (that run against the model). + _num_computed_tokens: int = 0 + _stage: SequenceStage = SequenceStage.PREFILL + # New output tokens appended. Used to get delta input. + _new_appended_tokens: List[int] = msgspec.field(default_factory=list) + _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) + + def __post_init__( self, - prompt_token_ids: List[int], - output_token_ids: Optional[List[int]] = None, ) -> None: - self._prompt_token_ids: List[int] = list(prompt_token_ids) - self._prompt_token_ids_tuple: Tuple[int, ...] 
= tuple(prompt_token_ids) - self._output_token_ids: List[int] = ( - list(output_token_ids) if output_token_ids is not None else []) - - self.cumulative_logprob = 0.0 - # The number of tokens that are computed (that run against the model). - self._num_computed_tokens = 0 - self._stage: SequenceStage = SequenceStage.PREFILL - - # New output tokens appended. Used to get delta input. - self._new_appended_tokens: List[int] = [] - + self._prompt_token_ids_tuple = tuple(self.prompt_token_ids) self._update_cached_all_tokens() def _update_cached_all_tokens(self): - self._cached_all_token_ids: List[int] = (self._prompt_token_ids + + self._cached_all_token_ids = (self.prompt_token_ids + self._output_token_ids) @property @@ -154,7 +150,7 @@ def prompt_token_ids(self) -> Tuple[int, ...]: @prompt_token_ids.setter def prompt_token_ids(self, new_prompt_token_ids) -> None: - self._prompt_token_ids = list(new_prompt_token_ids) + self.prompt_token_ids = list(new_prompt_token_ids) self._prompt_token_ids_tuple = tuple(new_prompt_token_ids) self._update_cached_all_tokens() @@ -174,10 +170,10 @@ def append_token_id(self, token_id: int, logprob: float) -> None: self.cumulative_logprob += logprob def get_len(self) -> int: - return len(self._output_token_ids) + len(self._prompt_token_ids) + return len(self._output_token_ids) + len(self.prompt_token_ids) def get_prompt_len(self) -> int: - return len(self._prompt_token_ids) + return len(self.prompt_token_ids) def get_output_len(self) -> int: return len(self._output_token_ids) @@ -226,14 +222,14 @@ def get_num_uncomputed_tokens(self) -> int: def get_last_token_id(self) -> int: if not self._output_token_ids: - return self._prompt_token_ids[-1] + return self.prompt_token_ids[-1] return self._output_token_ids[-1] def get_prompt_token_ids(self) -> Tuple[int, ...]: return self.prompt_token_ids def get_output_token_ids(self) -> Tuple[int, ...]: - return self.output_token_ids + return self._output_token_ids def get_delta(self) -> SequenceDataDelta: delta = SequenceDataDelta(self._new_appended_tokens, @@ -256,8 +252,8 @@ def stage(self) -> SequenceStage: def __repr__(self) -> str: return (f"SequenceData(" - f"prompt_token_ids={self._prompt_token_ids}, " - f"output_token_ids={self._output_token_ids}, " + f"prompt_token_ids={self.prompt_token_ids}, " + f"output_token_ids={self.output_token_ids}, " f"cumulative_logprob={self.cumulative_logprob}, " f"get_num_computed_tokens={self.get_num_computed_tokens()}") @@ -651,25 +647,17 @@ def __repr__(self) -> str: f"num_seqs={len(self.seqs_dict)})") -class SequenceGroupMetadataDecode: +class SequenceGroupMetadataDecode(msgspec.Struct, tag=True, array_like=True, omit_defaults=True): """Delta sequence group metadata.""" - def __init__( - self, - seq_data_delta: Dict[int, SequenceDataDelta], - request_id: str, - block_tables: Dict[int, List[int]], - do_sample: bool = True, - token_chunk_size: Optional[int] = None, - ) -> None: - self.seq_data_delta = seq_data_delta - self.request_id = request_id - self.block_tables = block_tables - self.token_chunk_size = token_chunk_size - self.do_sample = do_sample + seq_data_delta: Dict[int, SequenceDataDelta] + request_id: str + block_tables: Dict[int, List[int]] + do_sample: bool = True + token_chunk_size: Optional[int] = None -class SequenceGroupMetadata: +class SequenceGroupMetadata(msgspec.Struct, tag=True, array_like=True, omit_defaults=True): """Metadata for a sequence group. Used to create `AttentionMetadata`. 
Args: @@ -701,51 +689,36 @@ class SequenceGroupMetadata: prompt_adapter_request: Prompt Adapter request. """ - def __init__( - self, - request_id: str, - is_prompt: bool, - seq_data: Dict[int, SequenceData], - sampling_params: SamplingParams, - block_tables: Dict[int, List[int]], - do_sample: bool = True, - pooling_params: Optional[PoolingParams] = None, - token_chunk_size: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, - computed_block_nums: Optional[List[int]] = None, - state: Optional[SequenceGroupState] = None, - multi_modal_data: Optional["MultiModalDataDict"] = None, - encoder_seq_data: Optional[SequenceData] = None, - cross_block_table: Optional[List[int]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> None: - self.request_id = request_id - self.is_prompt = is_prompt - self.seq_data = seq_data - self.sampling_params = sampling_params - self.block_tables = block_tables - self.pooling_params = pooling_params - self.lora_request = lora_request - self.prompt_adapter_request = prompt_adapter_request - self.computed_block_nums = computed_block_nums - self.multi_modal_data = multi_modal_data - self.state = SequenceGroupState() if state is None else state - self.encoder_seq_data = encoder_seq_data - self.cross_block_table = cross_block_table - self._token_chunk_size = token_chunk_size - self.do_sample = do_sample - - # The number of speculative tokens adopted in this request. - # None means specuative decoding is not used. - # Zero means speculative decoding is disabled for some reasons. - # TODO: We should maintain this states out of the sequence group. - self.num_speculative_tokens = None - - if self._token_chunk_size is None: - if is_prompt: - self._token_chunk_size = list(seq_data.values())[0].get_len() + request_id: str + is_prompt: bool + seq_data: Dict[int, SequenceData] + sampling_params: SamplingParams + block_tables: Dict[int, List[int]] + do_sample: bool = True + # pooling_params: Optional[PoolingParams] = None + # lora_request: Optional[LoRARequest] = None + computed_block_nums: Optional[List[int]] = None + # state: Optional[SequenceGroupState] = None + # # from vllm.multimodal import MultiModalDataDict + multi_modal_data: Optional[Any] = None + # encoder_seq_data: Optional[SequenceData] = None + # cross_block_table: Optional[List[int]] = None + # prompt_adapter_request: Optional[PromptAdapterRequest] = None + token_chunk_size: Optional[int] = None + + ## Stateful fields that are lazily defined. ## + # The number of speculative tokens adopted in this request. + # None means specuative decoding is not used. + # Zero means speculative decoding is disabled for some reasons. + # TODO: We should maintain this states out of the sequence group. 
+ num_speculative_tokens: Optional[int] = None + + def __post_init__(self): + if self.token_chunk_size is None: + if self.is_prompt: + self.token_chunk_size = list(self.seq_data.values())[0].get_len() else: - self._token_chunk_size = 1 + self.token_chunk_size = 1 @property def lora_int_id(self) -> int: @@ -761,11 +734,6 @@ def prompt_adapter_num_virtual_tokens(self) -> int: return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \ if self.prompt_adapter_request else 0 - @property - def token_chunk_size(self) -> int: - """Return the number of tokens to be processed (chunk size).""" - assert self._token_chunk_size is not None - return self._token_chunk_size def apply_delta( self, sequence_group_metadata_decode: SequenceGroupMetadataDecode): @@ -773,7 +741,7 @@ def apply_delta( self.seq_data[id].apply_delta(delta) self.request_id = sequence_group_metadata_decode.request_id self.block_tables = sequence_group_metadata_decode.block_tables - self._token_chunk_size = sequence_group_metadata_decode.token_chunk_size + self.token_chunk_size = sequence_group_metadata_decode.token_chunk_size self.do_sample = sequence_group_metadata_decode.do_sample self.is_prompt = False @@ -992,7 +960,7 @@ def get_all_seq_ids_and_request_ids( return seq_ids, request_id_seq_ids_mapping -class HiddenStates: +class HiddenStates(msgspec.Struct, array_like=True, omit_defaults=True): """Hidden states corresponding to in-progress sequences. Used in speculative decoding to pass hidden states from the target model to the proposer model in the subsequent step. @@ -1000,11 +968,11 @@ class HiddenStates: seq_ids are the sequence ids of each entry of the batch dimension of the hidden_states tensor""" - def __init__(self, seq_group_metadata_list: List[SequenceGroupMetadata], - hidden_states: torch.Tensor): - assert len(seq_group_metadata_list) == len(hidden_states) - self.seq_ids: List[int] = get_all_seq_ids(seq_group_metadata_list) - self.hidden_states: torch.Tensor = hidden_states + seq_ids: List[int] + hidden_states: torch.Tensor + + def __post_init__(self): + assert len(self.seq_group_metadata_list) == len(self.hidden_states) def update(self, seq_group_metadata_list: List[SequenceGroupMetadata], hidden_states: torch.Tensor) -> None: @@ -1024,32 +992,29 @@ def prune(self, self.seq_ids = seq_ids -class ExecuteModelRequest: +class ExecuteModelRequest(msgspec.Struct, array_like=True, omit_defaults=True): """The model execution request, containing CPU metadata only. The LLM engine should create an instance of this class for each request batch.""" - def __init__( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: List[Tuple[int, int]]=None, - blocks_to_swap_out: List[Tuple[int, int]]=None, - blocks_to_copy: List[Tuple[int, int]]=None, - virtual_engine: int = 0, - num_lookahead_slots: int = 0, - running_queue_size: int = 0, - previous_hidden_states: Optional[HiddenStates] = None, - num_steps: int = 1, - finished_requests_ids: List[str] = None - ): - self.seq_group_metadata_list = seq_group_metadata_list - self.blocks_to_swap_in = blocks_to_swap_in or [] - self.blocks_to_swap_out = blocks_to_swap_out or [] - self.blocks_to_copy = blocks_to_copy or [] - self.virtual_engine = virtual_engine - self.num_lookahead_slots = num_lookahead_slots - self.running_queue_size = running_queue_size - self.previous_hidden_states = previous_hidden_states - self.num_steps = num_steps - self.finished_requests_ids = finished_requests_ids or [] + # The sequence group metadata list. 
+ seq_group_metadata_list: List[Union[SequenceGroupMetadata, SequenceGroupMetadataDecode]] + # Blocks to swap in. List of CPU -> GPU block number. + blocks_to_swap_in: List[Tuple[int, int]] = msgspec.field(default_factory=list) + # Blocks to swap out. List of GPU -> CPU block number. + blocks_to_swap_out: List[Tuple[int, int]] = msgspec.field(default_factory=list) + # Blocks to copy. Source to dest block. + blocks_to_copy: List[Tuple[int, int]] = msgspec.field(default_factory=list) + # Virtual engine ID for pipeline parallel. + virtual_engine: int = 0 + # The number of slots for lookahead decoding. + num_lookahead_slots: int = 0 + # The number of requests in the running queue. + running_queue_size: int = 0 + # Optional hidden states from prior step. + previous_hidden_states: Optional[HiddenStates] = None + # The number of forward steps to run. + num_steps: int = 1 + # Finished request ids since last step. + finished_requests_ids: List[str] = msgspec.field(default_factory=list) def clone( self, seq_group_metadata_list: List[SequenceGroupMetadata] @@ -1066,41 +1031,3 @@ def clone( previous_hidden_states=self.previous_hidden_states, num_steps=self.num_steps, finished_requests_ids=self.finished_requests_ids) - - - # # The sequence group metadata list. - # seq_group_metadata_list: List[SequenceGroupMetadata] - # # Blocks to swap in. List of CPU -> GPU block number. - # blocks_to_swap_in: List[Tuple[int, int]] = field(default_factory=list) - # # Blocks to swap out. List of GPU -> CPU block number. - # blocks_to_swap_out: List[Tuple[int, int]] = field(default_factory=list) - # # Blocks to copy. Source to dest block. - # blocks_to_copy: List[Tuple[int, int]] = field(default_factory=list) - # # Virtual engine ID for pipeline parallel. - # virtual_engine: int = 0 - # # The number of slots for lookahead decoding. - # num_lookahead_slots: int = 0 - # # The number of requests in the running queue. - # running_queue_size: int = 0 - # # Optional hidden states from prior step. - # previous_hidden_states: Optional[HiddenStates] = None - # # The number of forward steps to run. - # num_steps: int = 1 - # # Finished request ids since last step. - # finished_requests_ids: List[str] = field(default_factory=list) - - # def clone( - # self, seq_group_metadata_list: List[SequenceGroupMetadata] - # ) -> "ExecuteModelRequest": - # """Clone the request with a new sequence group metadata list.""" - # return ExecuteModelRequest( - # seq_group_metadata_list=seq_group_metadata_list, - # blocks_to_swap_in=self.blocks_to_swap_in.copy(), - # blocks_to_swap_out=self.blocks_to_swap_out.copy(), - # blocks_to_copy=self.blocks_to_copy.copy(), - # virtual_engine=self.virtual_engine, - # num_lookahead_slots=self.num_lookahead_slots, - # running_queue_size=self.running_queue_size, - # previous_hidden_states=self.previous_hidden_states, - # num_steps=self.num_steps, - # finished_requests_ids=self.finished_requests_ids) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 41f0aebf3c0..025f699b419 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -283,7 +283,7 @@ def _create_single_target_seq_group_metadata( target_seq_id: SequenceData( prompt_token_ids=prompt_token_ids, - output_token_ids=new_output_token_ids, + _output_token_ids=new_output_token_ids, ), } # This is a hack. 
Technically, spec decoding should compute diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8cf0aa5b898..b51aa6b47b3 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -462,7 +462,8 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, if hidden_states is not None: if self.previous_hidden_states is None: self.previous_hidden_states = HiddenStates( - execute_model_req.seq_group_metadata_list, hidden_states) + get_all_seq_ids(execute_model_req.seq_group_metadata_list), + hidden_states) else: self.previous_hidden_states.update( execute_model_req.seq_group_metadata_list, hidden_states) @@ -637,7 +638,7 @@ def _verify_tokens( index = accepted_index[:, None, None].expand(-1, 1, hs_size) hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d # Store hidden states from target model for subsequent decode step - self.previous_hidden_states = HiddenStates(seq_group_metadata_list, + self.previous_hidden_states = HiddenStates(get_all_seq_ids(seq_group_metadata_list), hidden_states) return accepted_token_ids, logprobs diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a1ec439d44c..59834168492 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -748,9 +748,9 @@ def profile_run(self) -> None: seq_data={group_id: seq_data}, sampling_params=sampling_params, block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_multi_modal_data, + # lora_request=dummy_lora_requests_per_seq[group_id] + # if dummy_lora_requests_per_seq else None, + # multi_modal_data=dummy_multi_modal_data, ) seqs.append(seq) @@ -1192,7 +1192,6 @@ def execute_model( intermediate_tensors=intermediate_tensors, **multi_modal_kwargs, **seqlen_agnostic_kwargs) - # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: return hidden_or_intermediate_states From de4e43ee1147d5d89a91709d054333258381b3a6 Mon Sep 17 00:00:00 2001 From: sang Date: Tue, 30 Jul 2024 11:52:28 -0700 Subject: [PATCH 08/36] ip --- a.py | 29 ++++++++++++++++++++++++++++ tests/prompts/example.txt | 9 ++++++++- vllm/engine/llm_engine.py | 16 ---------------- vllm/executor/ray_gpu_executor.py | 16 +++++++++++++--- vllm/executor/ray_utils.py | 13 +++++++++++-- vllm/sequence.py | 32 +++++++++++++++++-------------- vllm/worker/model_runner.py | 2 -- vllm/worker/worker_base.py | 3 +++ 8 files changed, 82 insertions(+), 38 deletions(-) create mode 100644 a.py diff --git a/a.py b/a.py new file mode 100644 index 00000000000..d277c552d95 --- /dev/null +++ b/a.py @@ -0,0 +1,29 @@ +from array import array +from vllm.sequence import ExecuteModelRequest, SequenceData +import msgspec + +with open('example.bin', 'rb') as file: + data = file.read() + +def dec_hook(type, obj): + # `type` here is the value of the custom type annotation being decoded. 
+ if type is array: + deserialized = array('l') + deserialized.frombytes(obj) + return deserialized + +# decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=dec_hook) + + +# print(decoder.decode(data)) + +def enc_hook(obj): + if isinstance(obj, array): + # convert the complex to a tuple of real, imag + return obj.tobytes() + +encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) +decoder = msgspec.msgpack.Decoder(SequenceData, dec_hook=dec_hook) + +data = SequenceData([1, 2, 3]) +print(decoder.decode(encoder.encode(data))) diff --git a/tests/prompts/example.txt b/tests/prompts/example.txt index 6e8c45b673e..cef4d1d7687 100644 --- a/tests/prompts/example.txt +++ b/tests/prompts/example.txt @@ -1 +1,8 @@ -vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. \ No newline at end of file +vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. +Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. +Compare and contrast artificial intelligence with human intelligence in terms of processing information. +Describe the basic components of a neural network and how it can be trained. +Write a short story about a robot that dreams for the first time. +Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. +Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. +Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' \ No newline at end of file diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4e22e51aba2..148eb271617 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -63,9 +63,6 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: return config.to_diff_dict() -last_input = None -last_output = None - _O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput) @@ -919,19 +916,6 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: finished_requests_ids=finished_requests_ids) output = self.model_executor.execute_model( execute_model_req=execute_model_req) - - # global last_input, last_output - # import pickle - # this_input = pickle.dumps(execute_model_req) - # this_output = pickle.dumps(output) - # print(f"{len(this_input)=}, {len(this_output)=}") - # from Levenshtein import distance - # if last_input is not None: - # input_distance = distance(this_input, last_input) - # output_distance = distance(this_output, last_output) - # print(f"{input_distance=}, {output_distance=}") - # last_input = this_input - # last_output = this_output else: output = [] diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index f81d5c12549..047c05830e6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -6,6 +6,7 @@ import time import msgspec import pickle +from array import array import vllm.envs as envs from vllm.executor.distributed_gpu_executor import ( # yapf: disable @@ -66,7 +67,12 @@ def _init_executor(self) -> None: self._init_workers_ray(placement_group) self.forward_dag: Optional["ray.dag.CompiledDAG"] = None - self.encoder = msgspec.msgpack.Encoder() + def enc_hook(obj: Any) -> Any: + if isinstance(obj, array): + # convert the complex to a tuple of real, imag + return obj.tobytes() + + self.encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) def 
_configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> Dict[str, Any]: @@ -283,10 +289,14 @@ def execute_model( s = time.time() # serialized_data = pickle.dumps(execute_model_req) + serialized_data = self.encoder.encode(execute_model_req) + # # Open a file in binary write mode + # with open('example.bin', 'wb') as file: + # # Write bytes to the file + # file.write(serialized_data) + # print(f"SANG-TODO input serialization takes {(time.time() - s) * 1000} ms index: {self.i}") - import sys - # print("SANG-TODO size: ,", sys.getsizeof(serialized_data)) outputs = ray.get(self.forward_dag.execute(serialized_data)) output = pickle.loads(outputs[0]) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index c10b8ea3d66..7801242216d 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,7 +1,8 @@ -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Type, Any import time import pickle import msgspec +from array import array from vllm.config import ParallelConfig from vllm.logger import init_logger @@ -26,7 +27,14 @@ def __init__(self, *args, **kwargs) -> None: # that thread. self.compiled_dag_cuda_device_set = False self.i = 0 - self.decoder = msgspec.msgpack.Decoder(ExecuteModelRequest) + + def dec_hook(type: Type, obj: Any) -> Any: + if type is array: + deserialized = array('l') + deserialized.frombytes(obj) + return deserialized + + self.decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=dec_hook) def get_node_ip(self) -> str: return get_ip() @@ -40,6 +48,7 @@ def execute_model_spmd(self, execute_model_req: bytes): """Used only when SPMD worker and compiled DAG are both enabled.""" s = time.time() + execute_model_req: ExecuteModelRequest = self.decoder.decode(execute_model_req) # execute_model_req: ExecuteModelRequest = pickle.loads(execute_model_req) # print(f"SANG-TODO input deserialization takes {(time.time() - s) * 1000} ms index: {self.i}") diff --git a/vllm/sequence.py b/vllm/sequence.py index 0655e6a7156..313dbee21e7 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -122,11 +122,11 @@ class SequenceData(msgspec.Struct, array_like=False, omit_defaults=True): output_token_ids: The token IDs of the output. cumulative_logprob: The cumulative log probability of the output. """ - prompt_token_ids: List[int] - _output_token_ids: Optional[List[int]] = msgspec.field(default_factory=list) - cumulative_logprob: float = 0.0 + _prompt_token_ids: array + _output_token_ids: Optional[array] = None ## The below fields should not be passed as an argument ## + cumulative_logprob: float = 0.0 _prompt_token_ids_tuple: Optional[Tuple[int, ...]] = None # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 @@ -138,21 +138,25 @@ class SequenceData(msgspec.Struct, array_like=False, omit_defaults=True): def __post_init__( self, ) -> None: - self.prompt_token_ids = array('l', self.prompt_token_ids) - self._output_token_ids = array( + if not isinstance(self._prompt_token_ids, array): + self._prompt_token_ids = array('l', self._prompt_token_ids) + if not isinstance(self._output_token_ids, array): + self._output_token_ids = array( 'l', self._output_token_ids if self._output_token_ids is not None else []) - self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(self.prompt_token_ids) + self._prompt_token_ids_tuple: Tuple[int, ...] 
= tuple(self._prompt_token_ids) self._update_cached_all_tokens() + def _update_cached_all_tokens(self): + self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + + self._output_token_ids) + @property def prompt_token_ids(self) -> Tuple[int, ...]: return self._prompt_token_ids_tuple @prompt_token_ids.setter def prompt_token_ids(self, new_prompt_token_ids) -> None: - self._prompt_token_ids = array('l', new_prompt_token_ids) - self._prompt_token_ids_tuple = tuple(new_prompt_token_ids) - self._update_cached_all_tokens() + raise NotImplementedError @property def prompt_token_ids_array(self) -> array: @@ -178,10 +182,10 @@ def append_token_id(self, token_id: int, logprob: float) -> None: self.cumulative_logprob += logprob def get_len(self) -> int: - return len(self._output_token_ids) + len(self.prompt_token_ids) + return len(self._output_token_ids) + len(self._prompt_token_ids) def get_prompt_len(self) -> int: - return len(self.prompt_token_ids) + return len(self._prompt_token_ids) def get_output_len(self) -> int: return len(self._output_token_ids) @@ -230,11 +234,11 @@ def get_num_uncomputed_tokens(self) -> int: def get_last_token_id(self) -> int: if not self._output_token_ids: - return self.prompt_token_ids[-1] + return self._prompt_token_ids[-1] return self._output_token_ids[-1] def get_prompt_token_ids(self) -> Tuple[int, ...]: - return self.prompt_token_ids + return self._prompt_token_ids def get_output_token_ids(self) -> Tuple[int, ...]: return self._output_token_ids @@ -260,7 +264,7 @@ def stage(self) -> SequenceStage: def __repr__(self) -> str: return (f"SequenceData(" - f"prompt_token_ids={self.prompt_token_ids}, " + f"prompt_token_ids={self._prompt_token_ids}, " f"output_token_ids={self.output_token_ids}, " f"cumulative_logprob={self.cumulative_logprob}, " f"get_num_computed_tokens={self.get_num_computed_tokens()}") diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0c423e8fed4..b95b89a66d7 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -713,8 +713,6 @@ def __init__( self.flashinfer_prefill_workspace_buffer = None self.flashinfer_prefill_wrapper = None - self.seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} - set_cpu_offload_max_bytes( int(self.cache_config.cpu_offload_gb * 1024**3)) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 03e3857e23c..76eebe76f62 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -282,6 +282,9 @@ def execute_model( # output is List[SamplerOutput] return output +# 1. spmd -> general & used by other backend easier +# 2. + def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest ) -> Optional[List[SamplerOutput]]: From dc7c4459d6d24221b8f227eb5b1441fba67b57d0 Mon Sep 17 00:00:00 2001 From: sang Date: Tue, 30 Jul 2024 15:39:12 -0700 Subject: [PATCH 09/36] . 
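
Note on the block-table change in this patch: the scheduler now caches the block table it last sent for each sequence and, when the Ray SPMD worker path (VLLM_USE_RAY_SPMD_WORKER) is enabled, ships only the newly appended block on each decode step; SequenceGroupMetadata.apply_delta then appends that block to the worker's existing table instead of replacing it. As a rough illustration of that idea only (the helper names below are made up for this note and are not part of the diff), a minimal sketch:

    # Illustrative only; not part of this patch.
    from typing import List

    def block_table_delta(cached: List[int], current: List[int]) -> List[int]:
        # Decode only ever appends blocks, so the delta is the new tail.
        new_blocks = current[len(cached):]
        cached.extend(new_blocks)
        return new_blocks

    def apply_block_table_delta(worker_table: List[int], delta: List[int]) -> None:
        # Receiving side: extend the locally cached table with the shipped blocks.
        worker_table.extend(delta)

    scheduler_cache = [7, 9]      # blocks already sent for this sequence
    worker_table = [7, 9]         # worker's cached copy
    delta = block_table_delta(scheduler_cache, [7, 9, 12])   # -> [12]
    apply_block_table_delta(worker_table, delta)             # worker_table == [7, 9, 12]
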
--- a.py | 58 +++++++++++++++++++---- b.py | 72 +++++++++++++++++++++++++++++ benchmarks/benchmark_throughput.py | 1 + example.bin | Bin 0 -> 76206 bytes vllm/core/scheduler.py | 18 ++++++-- vllm/executor/ray_gpu_executor.py | 8 ++-- vllm/sequence.py | 13 +++--- vllm/worker/worker_base.py | 3 -- 8 files changed, 149 insertions(+), 24 deletions(-) create mode 100644 b.py create mode 100644 example.bin diff --git a/a.py b/a.py index d277c552d95..78e769455e4 100644 --- a/a.py +++ b/a.py @@ -1,5 +1,7 @@ +import time +import sys from array import array -from vllm.sequence import ExecuteModelRequest, SequenceData +from vllm.sequence import ExecuteModelRequest, SequenceData, SequenceDataDelta, SequenceStage import msgspec with open('example.bin', 'rb') as file: @@ -11,19 +13,57 @@ def dec_hook(type, obj): deserialized = array('l') deserialized.frombytes(obj) return deserialized - -# decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=dec_hook) - - -# print(decoder.decode(data)) def enc_hook(obj): if isinstance(obj, array): # convert the complex to a tuple of real, imag return obj.tobytes() + +class Timer: + def __init__(self, msg): + self.msg = msg + + def __enter__(self): + self.start = time.time() + return self # This allows access to the instance in the 'as' part of the context manager + + def __exit__(self, exc_type, exc_val, exc_tb): + self.end = time.time() + self.elapsed_us = (self.end - self.start) * 1000 * 1000 + print(f"{self.msg=}. Elapsed time: {self.elapsed_us:.2f} us") + +# encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) +# decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=dec_hook) + +# with Timer("Serialization"): + # serialized = encoder.encode(data) +# print(f"{sys.getsizeof(data)=}") +# with Timer("Deserialization original"): +# decoder.decode(data) +# with Timer("Deserialization original"): +# data = decoder.decode(data) + +# with Timer("Serialization, big block tables"): +# data = encoder.encode(data) +# with Timer("Deserialization, big block tables"): +# data = decoder.decode(data) + +# for i, metadata in enumerate(data.seq_group_metadata_list): +# for key, value in metadata.block_tables.items(): +# metadata.block_tables[key] = [i] + +# with Timer("Serialization, small block tables"): +# data = encoder.encode(data) +# with Timer("Deserialization, small block tables"): +# data = decoder.decode(data) + +# print(decoder.decode(encoder.encode(data))) encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) -decoder = msgspec.msgpack.Decoder(SequenceData, dec_hook=dec_hook) +decoder = msgspec.msgpack.Decoder(SequenceDataDelta, dec_hook=dec_hook) -data = SequenceData([1, 2, 3]) -print(decoder.decode(encoder.encode(data))) +data = SequenceDataDelta([i for i in range(2048)], 0, 0, SequenceStage.DECODE) +with Timer("Serialization, big block tables"): + data = encoder.encode(data) +with Timer("Deserialization, big block tables"): + data = decoder.decode(data) diff --git a/b.py b/b.py new file mode 100644 index 00000000000..d8a4a29e6c6 --- /dev/null +++ b/b.py @@ -0,0 +1,72 @@ +import time +from array import array + +def t(): + l = [i for i in range(256)] + s = time.time() + a = array('l') + a.fromlist(l) + print((time.time() - s) * 1000 * 1000, "us") + +t() + + +import msgspec + +def dec_hook(type, obj): + # `type` here is the value of the custom type annotation being decoded. 
+ if type is array: + deserialized = array('l') + deserialized.frombytes(obj) + return deserialized + +def enc_hook(obj): + if isinstance(obj, array): + # convert the complex to a tuple of real, imag + return obj.tobytes() + +class Timer: + def __init__(self, msg): + self.msg = msg + + def __enter__(self): + self.start = time.time() + return self # This allows access to the instance in the 'as' part of the context manager + + def __exit__(self, exc_type, exc_val, exc_tb): + self.end = time.time() + self.elapsed_us = (self.end - self.start) * 1000 * 1000 + print(f"{self.msg=}. Elapsed time: {self.elapsed_us:.2f} us") + +encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) +decoder = msgspec.msgpack.Decoder(dec_hook=dec_hook) + +l = [i for i in range(256)] +d = {"1": l} + + +with Timer("Serialization array"): + # a = array('l') + # a.fromlist(l) + data = encoder.encode(a) +with Timer("Deserialization"): + data = decoder.decode(data) + +l = [i for i in range(256)] +a = array('l') +a.fromlist(l) + + +with Timer("Serialization bigger array"): + # a = array('l') + # a.fromlist(l) + data = encoder.encode(a) +with Timer("Deserialization"): + data = decoder.decode(data) + + +# for _ in range(5): +# with Timer("Serialization list"): +# data = encoder.encode(l) +# with Timer("Deserialization"): +# data = decoder.decode(data) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index a52e67bbbe7..d473fa9a16b 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -106,6 +106,7 @@ def run_vllm( max_num_batched_tokens=max_num_batched_tokens, distributed_executor_backend=distributed_executor_backend, load_format=load_format, + max_num_seqs=32, ) # Add the requests to the engine. diff --git a/example.bin b/example.bin new file mode 100644 index 0000000000000000000000000000000000000000..c2b1a978ec099ec59ae2c4e8add10a4dd7857212 GIT binary patch literal 76206 zcma%^b#z@t_xBr5ZW1UCg#vAFZbjPS?oQm3Pdqs;7nc?% za+AROo&4TEvsRw2tUWXPjGgc7J+t@BoZF}T5kEZk`Jt;Wx@^_?r>|H%9|eck+d3(OOL#D8ddgwBI$Jc167 znvq!BV~xhzj?x{mw#6DnWg*r!SX)yf&Q^^h((+bdZHcu7#bRuZwHel?Sewx3QzVY+ z*cfXg+EqwJ3$XIB5?FayxwKSc-?R1b>cTVf2H#mIyX?Kq`LDn zd`63Tp*(|G<`fz>Q)uuJpCT5Jrri%jI{)7v5DS76xn1-g zOz%6SjwDq=-z>`CrD+-cUomj$@OY*Vh3EX4!ZTfXs{Xv?GzJQ@4xpvRgg?=dFyS6l zeWft%XOYAXd?~5;iI#M}lwkZwtziFww&&>lp3d)-CA*H!Z|VGo8a?@%&adc9QmuGQ z)!=J>s!Tbbh~%Y2@_!X6cmnHWtdp=##5zHY+=sA^$2tz{Sgd2Pj>b9)OK={6bvV{x zSchV1xDLjWEGT{u)`2QlcoEhCSo>q`hqW)(KH?=lqVXuYPNZwSQduw3u$D&Cbk?)< z{)gVTX?=-QF?}=XtEJ|Bc#-mthetGHNJM7|(F`M@))Y07QG+#^)^D+PreX}%B+47G zCSpyXT*>iRJ7JB(8cUgyV<;U%>vT%SP&`JZ^Y*5GJ&mEF1@kBxL;ENid^+-}cTE46 z&|tEq{Unmup87G=je*Z;$$(9%9Yfpm)R-LEwN#IxN=-;)Z^XI*>w2vJVqJ%IE!H(y zS7Tj;btTpnG(CoOIo4%ZmttLlb+IbQ{{`zJx=x_$XS(m8`xkoFie1=F??LpwMd}Dr zMbu=_S400CYCmM)lHn1|91_9VLNId>!3)(y!Q)sLV4aV39@e>7=U|f=mDcYlYoKf_rDJKG zLCIK(^;fIxd3(_qQ8bpi^|WtGOD!D*VrTtGeJ2fLsUJ(Z;7mtRYdHBNVg#PqTAA#%jfC!D_~8!fM0SF})r@f_B(SkGXs79;&(tf#S_!g>-*daOz%K#yWQf~7}N z`)|*&wV7-p@8rG{=uVB4Q$1!vsf%OvBi&!sUtyA{G^G)m5(|9bE zJ5f1~ik&Eqs2E52PS`t9J`Q^vW#cGvC~cs0994ysjH7rQjWI>zR61`N?b}hio|Zbg zj-%^GQ|&Ce&AgsDS|tA_diSUI4La77`hu#rX`UcnEsrl^;;b<~cdMhJh6^%YhUD^`~58mh&MutKZ= z%g6G>&ZrW}`G_X7n&+>_DpOb>5iG?j!79co!upbqW9dAS#^Y#Sq33zW(mIP`p|?5q zX4sq3n8eL1U8?%hSJ83~ z^=4YkJy|4i1-<*xx{lPLv~EnLw7QB)Nn^;sg~MZ+J0zC#gk`S5vSvLU>S6|VI(8cN z2&{jx{=pi+>c{F+2eQ7$N@4Yy=dk^d-*o(1u$KDNlSL{XD3$Yhq&sW~e|6_W=^hqXFyNHpgQ%{+sqJ=ZjU9riBRbFgP)&%&OG zJ%i?z)Jhu0VvnIrlCdN94%pjckET?~?Xb6{RryC@Z$pVVGS?fhx5C~MTb6nQjbEwm zygg|jO{ZPI&LRvL#Nur!L_VSP)DlHX9h 
z674z!v5=vcM@F%CSQL#7)B8V}EPYBg`XQ-~8)d!u1H&5&T6kL@-Fhh6Doy{RCN>m6Rb!6QpRS9h5fTJYgR)7nAvd>@{So=+TsQI|HUej9({G{0prgGo%&Y^GC+9 zZ&)0SCDUi5F&ss>G2upp8&dfK;Rb}|gmG$n2(?w#RN?|7UFCm@6uu@>`kY9~uXLDc zkHW|3yp+Cg=-p4TynATeknnuM^9au+)a}n9Je%+=+J7TFlkg1lAAN<>buBxd@H9dV zYi~_>3gO9wClN}udXex1LZu!@r~zF?k0Cso@F+r!f+%_f;o*eR2Yx14O`t~PM_Q!b z3iKZcz9%(A?_+emMCKwgvI;dyR!RRs)Gen)J2(EIZ#jcA8Oo>WZ}<-XYFGq~9rIPL zKCdG9POnpR2#qShw*>M*_fNvD#4FyGPzw@oAk?^s4zW<#w;+@b=M!#5sDUnVv|f4* zl`krLeiIeierM#QBqLJru}JwJgfj`1F~|Rf(0r8AX@t`WpC^2dP~oYB&k{aEIE8RB z;nRdq(eVJCmnm=Iqx5}C!3JS5tCr-L?~@Zyd4R5Al#l% zI<=ys3AZEMmZnZZxdGH3buqWrpW++9jU;4b2Xu@je3Wc;r(=8PWT_f z`v~tf7(POH58>T}cM;x6cn9I_gtrmiN_Y$5&4f1*-biYI^gMbe(z}MtXJoIU&ZbFv zP8$P<)B1t#RBlFe$`(<-ilM*Z)%u%Zkuw&{H@W&N*}9(aI>KuSuOYme@G8P939q1e zLqZ+>BwQ~eyp&K)aQg320O}WpcHW&P!nwhNBkuq|?psfpP#tgf78!DtMX~mL>m6TDTpfUFI zN7FFcTpI~L)V1Ivk@DXOXA{m+u*764HG?K4#(ph*R-|k^seWCHYuAHiznygp&xLAbgx~BH;wW@q~{N zK1wJ-7)R$7dR*Z+k@%2E=?ZFkO#yD8sa)4QE$vuGO%7L0=Ao<+}RG|i%M z7VSIGFpDtvFI77=x7~y~aYZ~yYl5(YP~C^35ax8HBMDmxr7MZoOxQ#y95o!GXgy(_ z9vfRlsGeP$Hr^#v&#n}sy~K6GN=T?ZfmzaX>5@G=pUeufSCCyv|Niv1z^e?bHMh`s z$omW(&(PeFQT%RL6pbD8T@J-c-LP0kLyV%fmp;=-mXs2HK=?l4dxR?Fa>92A-zI#E za2erJ!X<=n5-uiugK!by>-tl{hlC4tE&q-1HNpjiuM)mO_%h*q!j}l=5zZx?L&t;4 zm9M?mSDHs%LDylln|`b0Oj;ykzYwk=l*Xj+&xET9e?35DQygxVvv zqyATPph!#(__c4?SShjZ=#EHbNE>q{e>*bglld=MZL9o}{{5(3PIEB>ztVF$-8VC& zncBax2WHilFvl~6Pf+EXMRCp1$b#amIO@4AxkOTsS*rJpJK8R4gdE2z?YX*5xs zPYC6NO+sWUS$t5WAlLesk6db|OiF&H!&rbNxxuZObY5jrj2IO@OjJlzK$K4uBg!LM zpXy$s^$0cRtgiPj!Xd&z!U4j5!mR$3cPB0DQM(OchA>T-B8&(nb^)PJ=n=Yv(r$&c zO{-045hiI5NY5snC37AX>SZn?tJRuE(%;O07G0YXmzcLMuV(02hUSbcliv@ErmMDYQjGvnT;J4Hh8z8TS`M4KojUPjeyqESThy}Fuc zBccsy9Yv*>8xWNf#fj9+DOyTYLfV6>mIsN<6-=th6TJLCPbmOM!D&K1d5}iVHGSNvyQs3qfoj_zp=gN*II*v#>oOs6&DS8xmMaQiIh$zdYV*<^i0~* zWK7j7+KKF?RO)otw-w9N0_H;)SVNzt)y|?-UQcF^jNp&MB4{j`9}R*XB95`{E&QlTxM(!3+mFOn8|=aBiB>?LGBqkkXz8)=s2 z{eKJ|t~ZHIX6P7(W{r&9Ps3tote2nu*EW`zS{64NLV;$ZhsiEbjg zk?01Z>xr%-(mk#rx|--JqAQ86AiA9BGNMa~E}>x?nvEV%unFyEbT039+UC+~hJ4B- z{&Q(I!I-v`&82ZJ?K{yhmwLl1+Mhb{R*?!xzd$-g?-OJ`B6~5}PwC&Aeq+uTYH8pf z3?8On$(Qu$NcX?d&#WF6OJm2ZHgZ^-E7D=q%q8&=6>^i9rCMPFaO@kZ=2GENX)+aS z<;-0ABtY*HEhl=1=xri1xKdIm5;xsJ`N<*$#yT%wL+^j+cvQg>GbB;^G|?oYCx{*= znn*N(XgtwlM2`~b^QbmFO!N@ZgG6$qcOAW}=sHUOi|H)7m9+0n#VDd~BI!!=h&qW9 zL>)x!%2Ko+b>czPM$}3qT}jbqQtD-9kdBn4SdHU{WG^DSg8scoXjj6)4E#(Ysl@nY zbl=9%-|)r!^RQSNd*ds)#Cy zH1ZwZo21(CXU@jf$!DDHa|hG&Z-!nR zxt{;uuy`6P=l}j!&&xD>GLPnY^i4yDM`gHn2l zvRwWM(Z1B`P{o%>ZAIb}QnJeLAf;`B14@h?Lc(Z4v9Cnp`W`EEE#90+s+4Z@ z36akG_>t%%A~k>?5WP?I9$jY4sU+Y0B~9-bT?^h3DVJU~-lTjlQmW}}Gtg5whLrXI z{6K1bQtQ!m4ADBGzles221$9OpC=udf6IlQoX1>9g)E#s=&xtsAb1V5dFxmD4rSbiB=JPN98EmcO~&( z&=#aY5<2&BG!^bACtPA z)McbDC3Oj@i%DHXN`A|wupLY4d{XC;I+xVHNu5LLY*J^L5^PNCdZf-Ibp|PEYp0Pq zmDDMuP9~+r*e5DW-YvAvCv^g;<4GMyN~)8h$B;Uj)KR3=cPM%UDbtIV9Zu>nQX0RR zPmfjx%E}rNyJ#O0Mt<|Zr(4G#l#$Zpp^(+ap>|T6k&`*hw* z=3TNEko}m3Jo?QrMezk9C0crN2rZw$Qd<5-KlAIb2pW6lS0jC?jYw&%QP1B%H;?s^ zDkl}E!XZ^AM!blw+(1{%MAZF_)wRHAU*!^vsidAIr4_UjNy+chWKz=Do+9-msY%LG zvPz`%2^0E`&Kq^jH|kxX+}@Yd?kZTY5mh;5O2{2TwIDc{)Ip?-uoRp@li)po)c&OQ zBPDf8(S1nmP5W-7_97*9N}N4N?M_M@sZ)yXLQ1Y~cOoTSN>PXORNbt^sPHA7^sXg) zKG}~*$ctkQP11r|89bQ5HFTWJ(Dacda_z8q8jEIa4$mD)?LcaKQc~XZ{L!Sgqfsr* zwj@3!CGAa2X>W3n_7{mb)y4*i9Zc1HnsvUxd>W*@8D89U-o=~I^`a?xp5a8KUM8jg z>uYIPOMX7*lA1$mHmO;pW|Go4q4`(^Z;_gzYxz&4CD;q=V9#VIc zx=XRV8KmwcrMCBWQn!)1mDDYyZYFgTsT)bj$HeuduA@y;T+fm=jZo=BbiGRUJhC4u nR;+y|ns+@w!Tj?W)J`jhp)m|SU%USF>C@L+ukE+0^{4+o7t) bool: @@ -993,10 +995,21 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # seq_id -> physical block 
numbers block_tables: Dict[int, List[int]] = {} + is_prompt = seq_group.is_prefill() for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data - block_tables[seq_id] = self.block_manager.get_block_table(seq) + if is_prompt or not envs.VLLM_USE_RAY_SPMD_WORKER: + block_table = self.block_manager.get_block_table(seq) + block_tables[seq_id] = block_table + self._block_table_cache[seq_group.request_id][seq_id] = block_table + else: + block_table = self.block_manager.get_block_table(seq) + if len(self._block_table_cache[seq_group.request_id][seq_id]) < len(block_table): + block_tables[seq_id] = [block_table[-1]] + self._block_table_cache[seq_group.request_id][seq_id].append(block_table[-1]) + else: + block_tables[seq_id] = [] self.block_manager.access_all_blocks_in_seq(seq, now) common_computed_block_nums = ( @@ -1004,7 +1017,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_group.get_seqs(status=SequenceStatus.RUNNING))) do_sample = True - if seq_group.is_prefill(): + if is_prompt: seqs = seq_group.get_seqs() # Prefill has only 1 sequence. assert len(seqs) == 1 @@ -1019,7 +1032,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. - is_prompt = seq_group.is_prefill() if is_prompt or not envs.VLLM_USE_RAY_SPMD_WORKER: seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 047c05830e6..76dc08d6a61 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -292,9 +292,11 @@ def execute_model( serialized_data = self.encoder.encode(execute_model_req) # # Open a file in binary write mode - # with open('example.bin', 'wb') as file: - # # Write bytes to the file - # file.write(serialized_data) + import sys + if sys.getsizeof(serialized_data) > 60000: + with open('example.bin', 'wb') as file: + # Write bytes to the file + file.write(serialized_data) # print(f"SANG-TODO input serialization takes {(time.time() - s) * 1000} ms index: {self.i}") diff --git a/vllm/sequence.py b/vllm/sequence.py index 313dbee21e7..a3f57f25aaf 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -109,7 +109,7 @@ class SequenceDataDelta(msgspec.Struct, array_like=True, omit_defaults=True): new_stage: SequenceStage -class SequenceData(msgspec.Struct, array_like=False, omit_defaults=True): +class SequenceData(msgspec.Struct, omit_defaults=True): """Data associated with a sequence. Args: @@ -125,7 +125,7 @@ class SequenceData(msgspec.Struct, array_like=False, omit_defaults=True): _prompt_token_ids: array _output_token_ids: Optional[array] = None - ## The below fields should not be passed as an argument ## + ### The below fields should not be passed as an argument ### cumulative_logprob: float = 0.0 _prompt_token_ids_tuple: Optional[Tuple[int, ...]] = None # The number of tokens that are computed (that run against the model). 
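The hunk above turns SequenceData into a msgspec.Struct that keeps track of newly appended tokens so that later scheduler-to-worker messages can carry only a delta. As a rough, self-contained sketch of that snapshot-then-delta pattern (class and field names below are simplified stand-ins, not the real vLLM definitions):

# Minimal sketch (assumed names) of the snapshot-then-delta pattern: the full
# token arrays are shipped once, afterwards only the newly appended tokens.
from array import array
from typing import List

import msgspec


class SeqDelta(msgspec.Struct, array_like=True, omit_defaults=True):
    new_output_token_ids: List[int]
    new_cumulative_logprob: float


class SeqData(msgspec.Struct, omit_defaults=True):
    prompt_token_ids: array
    output_token_ids: array = msgspec.field(default_factory=lambda: array("I", []))
    cumulative_logprob: float = 0.0
    # Tokens appended since the last delta was taken (never sent in a delta).
    _new_tokens: List[int] = msgspec.field(default_factory=list)

    def append(self, token_id: int, logprob: float) -> None:
        self.output_token_ids.append(token_id)
        self._new_tokens.append(token_id)
        self.cumulative_logprob += logprob

    def get_delta(self) -> SeqDelta:
        delta = SeqDelta(self._new_tokens, self.cumulative_logprob)
        self._new_tokens = []
        return delta

    def apply_delta(self, delta: SeqDelta) -> None:
        self.output_token_ids.extend(delta.new_output_token_ids)
        self.cumulative_logprob = delta.new_cumulative_logprob


# The scheduler keeps the authoritative copy; the worker replays deltas.
sched = SeqData(array("I", range(16)))
worker = SeqData(array("I", range(16)))
sched.append(100, -0.5)
sched.append(101, -0.7)
worker.apply_delta(sched.get_delta())
assert list(worker.output_token_ids) == [100, 101]

The pending-token buffer is reset every time a delta is taken, which is what `_new_appended_tokens` is for in the real struct.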
@@ -661,7 +661,6 @@ def __repr__(self) -> str: class SequenceGroupMetadataDecode(msgspec.Struct, tag=True, array_like=True, omit_defaults=True): """Delta sequence group metadata.""" - seq_data_delta: Dict[int, SequenceDataDelta] request_id: str block_tables: Dict[int, List[int]] @@ -718,7 +717,7 @@ class SequenceGroupMetadata(msgspec.Struct, tag=True, array_like=True, omit_defa # prompt_adapter_request: Optional[PromptAdapterRequest] = None token_chunk_size: Optional[int] = None - ## Stateful fields that are lazily defined. ## + ### Stateful fields that are lazily defined. ### # The number of speculative tokens adopted in this request. # None means specuative decoding is not used. # Zero means speculative decoding is disabled for some reasons. @@ -746,13 +745,15 @@ def prompt_adapter_num_virtual_tokens(self) -> int: return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \ if self.prompt_adapter_request else 0 - def apply_delta( self, sequence_group_metadata_decode: SequenceGroupMetadataDecode): for id, delta in sequence_group_metadata_decode.seq_data_delta.items(): self.seq_data[id].apply_delta(delta) self.request_id = sequence_group_metadata_decode.request_id - self.block_tables = sequence_group_metadata_decode.block_tables + for seq_id, block_table in sequence_group_metadata_decode.block_tables.items(): + if len(block_table) > 0: + self.block_tables[seq_id].append(block_table[0]) + # self.block_tables = sequence_group_metadata_decode.block_tables self.token_chunk_size = sequence_group_metadata_decode.token_chunk_size self.do_sample = sequence_group_metadata_decode.do_sample self.is_prompt = False diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 76eebe76f62..03e3857e23c 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -282,9 +282,6 @@ def execute_model( # output is List[SamplerOutput] return output -# 1. spmd -> general & used by other backend easier -# 2. - def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest ) -> Optional[List[SamplerOutput]]: From a906a9d8521081771df94a77845c212c414488dd Mon Sep 17 00:00:00 2001 From: sang Date: Tue, 30 Jul 2024 23:10:44 -0700 Subject: [PATCH 10/36] msgspec migration done --- a.py | 9 ++++- b.py | 13 +++--- vllm/adapter_commons/request.py | 1 - vllm/core/scheduler.py | 15 ++++--- vllm/executor/ray_gpu_executor.py | 1 + vllm/executor/ray_utils.py | 6 ++- vllm/lora/request.py | 6 +-- vllm/multimodal/base.py | 7 ++-- vllm/multimodal/image.py | 6 +-- vllm/multimodal/registry.py | 10 ++--- vllm/pooling_params.py | 7 ++-- vllm/prompt_adapter/request.py | 4 +- vllm/sequence.py | 56 +++++++++++++++----------- vllm/spec_decode/spec_decode_worker.py | 4 +- 14 files changed, 84 insertions(+), 61 deletions(-) diff --git a/a.py b/a.py index 78e769455e4..bf6f24ac543 100644 --- a/a.py +++ b/a.py @@ -7,6 +7,7 @@ with open('example.bin', 'rb') as file: data = file.read() + def dec_hook(type, obj): # `type` here is the value of the custom type annotation being decoded. if type is array: @@ -14,12 +15,15 @@ def dec_hook(type, obj): deserialized.frombytes(obj) return deserialized + def enc_hook(obj): if isinstance(obj, array): # convert the complex to a tuple of real, imag return obj.tobytes() - + + class Timer: + def __init__(self, msg): self.msg = msg @@ -32,11 +36,12 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.elapsed_us = (self.end - self.start) * 1000 * 1000 print(f"{self.msg=}. 
Elapsed time: {self.elapsed_us:.2f} us") + # encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) # decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=dec_hook) # with Timer("Serialization"): - # serialized = encoder.encode(data) +# serialized = encoder.encode(data) # print(f"{sys.getsizeof(data)=}") # with Timer("Deserialization original"): # decoder.decode(data) diff --git a/b.py b/b.py index d8a4a29e6c6..7d591897247 100644 --- a/b.py +++ b/b.py @@ -1,6 +1,7 @@ import time from array import array + def t(): l = [i for i in range(256)] s = time.time() @@ -8,11 +9,12 @@ def t(): a.fromlist(l) print((time.time() - s) * 1000 * 1000, "us") -t() +t() import msgspec + def dec_hook(type, obj): # `type` here is the value of the custom type annotation being decoded. if type is array: @@ -20,12 +22,15 @@ def dec_hook(type, obj): deserialized.frombytes(obj) return deserialized + def enc_hook(obj): if isinstance(obj, array): # convert the complex to a tuple of real, imag return obj.tobytes() - + + class Timer: + def __init__(self, msg): self.msg = msg @@ -38,13 +43,13 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.elapsed_us = (self.end - self.start) * 1000 * 1000 print(f"{self.msg=}. Elapsed time: {self.elapsed_us:.2f} us") + encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) decoder = msgspec.msgpack.Decoder(dec_hook=dec_hook) l = [i for i in range(256)] d = {"1": l} - with Timer("Serialization array"): # a = array('l') # a.fromlist(l) @@ -56,7 +61,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): a = array('l') a.fromlist(l) - with Timer("Serialization bigger array"): # a = array('l') # a.fromlist(l) @@ -64,7 +68,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): with Timer("Deserialization"): data = decoder.decode(data) - # for _ in range(5): # with Timer("Serialization list"): # data = encoder.encode(l) diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py index 69775ab7d45..b9ebc50cc5a 100644 --- a/vllm/adapter_commons/request.py +++ b/vllm/adapter_commons/request.py @@ -2,7 +2,6 @@ from dataclasses import dataclass -@dataclass class AdapterRequest: """ Base class for adapter requests. 
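The a.py/b.py scratch scripts above are timing msgspec round trips where token ids travel as array("I") bytes via enc_hook/dec_hook. A cleaned-up, self-contained version of that measurement might look like the following (TokenBatch is an illustrative stand-in, and absolute timings depend on the machine):

# Self-contained version of the round trip being timed above: token ids travel
# as array("I") bytes via enc_hook/dec_hook instead of a list of integers.
import time
from array import array
from typing import Any, Type

import msgspec


class TokenBatch(msgspec.Struct, array_like=True, omit_defaults=True):
    token_ids: array


def enc_hook(obj: Any) -> Any:
    # Fallback for types msgspec does not handle natively.
    if isinstance(obj, array):
        return obj.tobytes()
    raise NotImplementedError(f"cannot encode {type(obj)}")


def dec_hook(t: Type, obj: Any) -> Any:
    # Rebuild an unsigned-int array from the raw bytes produced by enc_hook.
    if t is array:
        out = array("I")
        out.frombytes(obj)
        return out
    raise NotImplementedError(f"cannot decode into {t}")


encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook)
decoder = msgspec.msgpack.Decoder(TokenBatch, dec_hook=dec_hook)

payload = TokenBatch(array("I", range(64 * 256)))

start = time.perf_counter()
encoded = encoder.encode(payload)
mid = time.perf_counter()
restored = decoder.decode(encoded)
end = time.perf_counter()

assert restored.token_ids == payload.token_ids
print(f"encode {(mid - start) * 1e6:.1f} us, "
      f"decode {(end - mid) * 1e6:.1f} us, {len(encoded)} bytes")

Shipping the array as raw bytes is the point of the hooks: msgpack then stores one binary blob per sequence instead of one integer per token.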
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index a66ab69d882..c18d4a72ce0 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -334,7 +334,8 @@ def __init__( else 0) self.num_cumulative_preemption: int = 0 from collections import defaultdict - self._block_table_cache: Dict[int, Dict[int, List[int]]] = defaultdict(dict) + self._block_table_cache: Dict[int, Dict[int, + List[int]]] = defaultdict(dict) @property def lora_enabled(self) -> bool: @@ -1001,15 +1002,19 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data - if is_prompt or not envs.VLLM_USE_RAY_SPMD_WORKER: + if is_prompt or not envs.VLLM_USE_RAY_SPMD_WORKER or True: block_table = self.block_manager.get_block_table(seq) block_tables[seq_id] = block_table - self._block_table_cache[seq_group.request_id][seq_id] = block_table + self._block_table_cache[ + seq_group.request_id][seq_id] = block_table else: block_table = self.block_manager.get_block_table(seq) - if len(self._block_table_cache[seq_group.request_id][seq_id]) < len(block_table): + if len(self._block_table_cache[seq_group.request_id] + [seq_id]) < len(block_table): block_tables[seq_id] = [block_table[-1]] - self._block_table_cache[seq_group.request_id][seq_id].append(block_table[-1]) + self._block_table_cache[ + seq_group.request_id][seq_id].append( + block_table[-1]) else: block_tables[seq_id] = [] self.block_manager.access_all_blocks_in_seq(seq, now) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 76dc08d6a61..7b98fb11d3e 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -67,6 +67,7 @@ def _init_executor(self) -> None: self._init_workers_ray(placement_group) self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + def enc_hook(obj: Any) -> Any: if isinstance(obj, array): # convert the complex to a tuple of real, imag diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7801242216d..67f3bb69723 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -34,7 +34,8 @@ def dec_hook(type: Type, obj: Any) -> Any: deserialized.frombytes(obj) return deserialized - self.decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=dec_hook) + self.decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, + dec_hook=dec_hook) def get_node_ip(self) -> str: return get_ip() @@ -49,7 +50,8 @@ def execute_model_spmd(self, execute_model_req: bytes): enabled.""" s = time.time() - execute_model_req: ExecuteModelRequest = self.decoder.decode(execute_model_req) + execute_model_req: ExecuteModelRequest = self.decoder.decode( + execute_model_req) # execute_model_req: ExecuteModelRequest = pickle.loads(execute_model_req) # print(f"SANG-TODO input deserialization takes {(time.time() - s) * 1000} ms index: {self.i}") # TODO(swang): This is needed right now because Ray aDAG executes diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 5d791424fbe..002cceae475 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -3,10 +3,10 @@ from typing import Optional from vllm.adapter_commons.request import AdapterRequest +import msgspec -@dataclass -class LoRARequest(AdapterRequest): +class LoRARequest(msgspec.Struct, AdapterRequest): """ Request for a LoRA adapter. 
@@ -22,7 +22,7 @@ class LoRARequest(AdapterRequest): lora_name: str lora_int_id: int lora_path: str = "" - lora_local_path: Optional[str] = field(default=None, repr=False) + lora_local_path: Optional[str] = msgspec.field(default=None) long_lora_max_len: Optional[int] = None __hash__ = AdapterRequest.__hash__ diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 5abd0ad61cd..26d76814576 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -10,7 +10,7 @@ from PIL import Image from torch import nn -from vllm.config import ModelConfig +# from vllm.config import ModelConfig from vllm.inputs import InputContext from vllm.logger import init_logger @@ -200,8 +200,7 @@ def wrapper(model_cls: N) -> N: return wrapper - def map_input(self, model_config: ModelConfig, - data: object) -> MultiModalInputs: + def map_input(self, model_config: Any, data: object) -> MultiModalInputs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. @@ -271,7 +270,7 @@ def wrapper(model_cls: N) -> N: return wrapper - def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: + def get_max_multimodal_tokens(self, model_config: Any) -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 3b37ce9149f..02e878f8fe5 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,11 +1,11 @@ from functools import lru_cache -from typing import List, Optional, Tuple, TypeVar +from typing import List, Optional, Tuple, TypeVar, Any import torch from PIL import Image from transformers import PreTrainedTokenizerBase -from vllm.config import ModelConfig +# from vllm.config import ModelConfig from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.image_processor import get_image_processor @@ -105,7 +105,7 @@ class ImagePlugin(MultiModalPlugin): def get_data_key(self) -> str: return "image" - def _get_hf_image_processor(self, model_config: ModelConfig): + def _get_hf_image_processor(self, model_config: Any): return cached_get_image_processor( model_config.model, trust_remote_code=model_config.trust_remote_code) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index d8e1b68178a..8d531fb4be2 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,9 +1,9 @@ import functools -from typing import Dict, Optional, Sequence +from typing import Dict, Optional, Sequence, Any import torch -from vllm.config import ModelConfig +# from vllm.config import ModelConfig from vllm.logger import init_logger from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs, @@ -75,7 +75,7 @@ def register_image_input_mapper( """ return self.register_input_mapper("image", mapper) - def map_input(self, model_config: ModelConfig, + def map_input(self, model_config: Any, data: MultiModalDataDict) -> MultiModalInputs: """ Apply an input mapper to the data passed to the model. @@ -102,7 +102,7 @@ def map_input(self, model_config: ModelConfig, return MultiModalInputs(merged_dict) - def create_input_mapper(self, model_config: ModelConfig): + def create_input_mapper(self, model_config: Any): """ Create an input mapper (see :meth:`map_input`) for a specific model. 
""" @@ -130,7 +130,7 @@ def register_max_image_tokens( """ return self.register_max_multimodal_tokens("image", max_mm_tokens) - def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: + def get_max_multimodal_tokens(self, model_config: Any) -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 3b95d73ddc2..f015504e5b4 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -1,15 +1,14 @@ from typing import Any, Optional +import msgspec -class PoolingParams: +class PoolingParams(msgspec.Struct, omit_defaults=True): """Pooling parameters for pooling. Attributes: additional_data: Any additional data needed for pooling. """ - - def __init__(self, additional_data: Optional[Any] = None): - self.additional_data = additional_data + additional_data: Optional[Any] = None def clone(self) -> "PoolingParams": """Returns a deep copy of the PoolingParams instance.""" diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index c0c98cf72bb..a7e71a75f6c 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -1,10 +1,10 @@ from dataclasses import dataclass from vllm.adapter_commons.request import AdapterRequest +import msgspec -@dataclass -class PromptAdapterRequest(AdapterRequest): +class PromptAdapterRequest(msgspec.Struct, AdapterRequest, array_like=True): """ Request for a Prompt adapter. """ diff --git a/vllm/sequence.py b/vllm/sequence.py index d758e0631ea..112e2d4ce98 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -19,7 +19,7 @@ if TYPE_CHECKING: from vllm.inputs import LLMInputs - from vllm.multimodal import MultiModalDataDict + from vllm.multimodal.base import MultiModalDataDict from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -135,15 +135,15 @@ class SequenceData(msgspec.Struct, omit_defaults=True): _new_appended_tokens: List[int] = msgspec.field(default_factory=list) _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) - def __post_init__( - self, - ) -> None: + def __post_init__(self, ) -> None: if not isinstance(self._prompt_token_ids, array): self._prompt_token_ids = array('l', self._prompt_token_ids) if not isinstance(self._output_token_ids, array): self._output_token_ids = array( - 'l', self._output_token_ids if self._output_token_ids is not None else []) - self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(self._prompt_token_ids) + 'l', self._output_token_ids + if self._output_token_ids is not None else []) + self._prompt_token_ids_tuple: Tuple[int, ...] = tuple( + self._prompt_token_ids) self._update_cached_all_tokens() def _update_cached_all_tokens(self): @@ -650,7 +650,10 @@ def __repr__(self) -> str: f"num_seqs={len(self.seqs_dict)})") -class SequenceGroupMetadataDecode(msgspec.Struct, tag=True, array_like=True, omit_defaults=True): +class SequenceGroupMetadataDecode(msgspec.Struct, + tag=True, + array_like=True, + omit_defaults=True): """Delta sequence group metadata.""" seq_data_delta: Dict[int, SequenceDataDelta] request_id: str @@ -659,7 +662,10 @@ class SequenceGroupMetadataDecode(msgspec.Struct, tag=True, array_like=True, omi token_chunk_size: Optional[int] = None -class SequenceGroupMetadata(msgspec.Struct, tag=True, array_like=True, omit_defaults=True): +class SequenceGroupMetadata(msgspec.Struct, + tag=True, + array_like=True, + omit_defaults=True): """Metadata for a sequence group. Used to create `AttentionMetadata`. 
Args: @@ -696,15 +702,15 @@ class SequenceGroupMetadata(msgspec.Struct, tag=True, array_like=True, omit_defa sampling_params: SamplingParams block_tables: Dict[int, List[int]] do_sample: bool = True - # pooling_params: Optional[PoolingParams] = None - # lora_request: Optional[LoRARequest] = None + pooling_params: Optional[PoolingParams] = None + lora_request: Optional[LoRARequest] = None computed_block_nums: Optional[List[int]] = None - # state: Optional[SequenceGroupState] = None - # # from vllm.multimodal import MultiModalDataDict + # "MultiModalDataDict" types. We have to use Any due to msgspec + # doesn't allow to have union of 2 different dicts. multi_modal_data: Optional[Any] = None - # encoder_seq_data: Optional[SequenceData] = None - # cross_block_table: Optional[List[int]] = None - # prompt_adapter_request: Optional[PromptAdapterRequest] = None + encoder_seq_data: Optional[SequenceData] = None + cross_block_table: Optional[List[int]] = None + prompt_adapter_request: Optional[PromptAdapterRequest] = None token_chunk_size: Optional[int] = None ### Stateful fields that are lazily defined. ### @@ -717,7 +723,8 @@ class SequenceGroupMetadata(msgspec.Struct, tag=True, array_like=True, omit_defa def __post_init__(self): if self.token_chunk_size is None: if self.is_prompt: - self.token_chunk_size = list(self.seq_data.values())[0].get_len() + self.token_chunk_size = list( + self.seq_data.values())[0].get_len() else: self.token_chunk_size = 1 @@ -740,10 +747,10 @@ def apply_delta( for id, delta in sequence_group_metadata_decode.seq_data_delta.items(): self.seq_data[id].apply_delta(delta) self.request_id = sequence_group_metadata_decode.request_id - for seq_id, block_table in sequence_group_metadata_decode.block_tables.items(): - if len(block_table) > 0: - self.block_tables[seq_id].append(block_table[0]) - # self.block_tables = sequence_group_metadata_decode.block_tables + # for seq_id, block_table in sequence_group_metadata_decode.block_tables.items(): + # if len(block_table) > 0: + # self.block_tables[seq_id].append(block_table[0]) + self.block_tables = sequence_group_metadata_decode.block_tables self.token_chunk_size = sequence_group_metadata_decode.token_chunk_size self.do_sample = sequence_group_metadata_decode.do_sample self.is_prompt = False @@ -999,11 +1006,14 @@ class ExecuteModelRequest(msgspec.Struct, array_like=True, omit_defaults=True): """The model execution request, containing CPU metadata only. The LLM engine should create an instance of this class for each request batch.""" # The sequence group metadata list. - seq_group_metadata_list: List[Union[SequenceGroupMetadata, SequenceGroupMetadataDecode]] + seq_group_metadata_list: List[Union[SequenceGroupMetadata, + SequenceGroupMetadataDecode]] # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: List[Tuple[int, int]] = msgspec.field(default_factory=list) + blocks_to_swap_in: List[Tuple[int, + int]] = msgspec.field(default_factory=list) # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: List[Tuple[int, int]] = msgspec.field(default_factory=list) + blocks_to_swap_out: List[Tuple[int, + int]] = msgspec.field(default_factory=list) # Blocks to copy. Source to dest block. blocks_to_copy: List[Tuple[int, int]] = msgspec.field(default_factory=list) # Virtual engine ID for pipeline parallel. 
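Since `seq_group_metadata_list` above is typed as a Union of the full and delta structs, both carry `tag=True` so a worker-side decoder can tell which variant it received. A minimal sketch of that tagged-union round trip, using simplified stand-in classes rather than the real vLLM types:

# Tagged msgspec structs: the tag travels with the payload, so a decoder typed
# as a Union can reconstruct the right class on the receiving side.
from typing import List, Union

import msgspec


class FullMetadata(msgspec.Struct, tag=True, array_like=True, omit_defaults=True):
    request_id: str
    prompt_token_ids: List[int]


class DeltaMetadata(msgspec.Struct, tag=True, array_like=True, omit_defaults=True):
    request_id: str
    new_token_ids: List[int]


Batch = List[Union[FullMetadata, DeltaMetadata]]

encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(Batch)

batch: Batch = [
    FullMetadata("req-0", [1, 2, 3]),   # first prefill ships everything
    DeltaMetadata("req-1", [42]),       # later steps ship only what changed
]
restored = decoder.decode(encoder.encode(batch))
print([type(item).__name__ for item in restored])  # ['FullMetadata', 'DeltaMetadata']

With array_like=True the tag is simply the first element of the encoded array, so the discrimination adds one extra field per message.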
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 95811f7675d..91674427dbd 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -635,8 +635,8 @@ def _verify_tokens( index = accepted_index[:, None, None].expand(-1, 1, hs_size) hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d # Store hidden states from target model for subsequent decode step - self.previous_hidden_states = HiddenStates(get_all_seq_ids(seq_group_metadata_list), - hidden_states) + self.previous_hidden_states = HiddenStates( + get_all_seq_ids(seq_group_metadata_list), hidden_states) return accepted_token_ids, logprobs From 4af66997975eb2b53ff3c41dd00209557bff34c1 Mon Sep 17 00:00:00 2001 From: sang Date: Thu, 1 Aug 2024 00:24:30 -0700 Subject: [PATCH 11/36] ip. preemption and chunked prefill not working yet. --- .buildkite/test-pipeline.yaml | 1 + .../test_basic_distributed_correctness.py | 2 +- vllm/executor/ray_gpu_executor.py | 2 ++ vllm/worker/worker.py | 12 ++++++++---- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 91418e5ec17..7b705b958e2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -88,6 +88,7 @@ steps: - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 7f2ad6cc728..7a0e5673b2c 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -47,7 +47,7 @@ def test_models( # will hurt multiprocessing backend with fork method (the default method). 
with vllm_runner(model, dtype=dtype, - tensor_parallel_size=1, + tensor_parallel_size=2, distributed_executor_backend=distributed_executor_backend ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 7b98fb11d3e..b11699a38b5 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -8,6 +8,8 @@ import pickle from array import array +from regex import P + import vllm.envs as envs from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index ac473d3791d..80b73305152 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -303,10 +303,14 @@ def _get_cached_seq_group_metadata(self, seq_group_metadata_list): assert isinstance(metadata_or_delta, SequenceGroupMetadata) self._seq_group_metadata_cache[request_id] = metadata_or_delta else: - assert isinstance(metadata_or_delta, - SequenceGroupMetadataDecode) - self._seq_group_metadata_cache[request_id].apply_delta( - metadata_or_delta) + if isinstance(metadata_or_delta, SequenceGroupMetadataDecode): + self._seq_group_metadata_cache[request_id].apply_delta( + metadata_or_delta) + else: + # If metadata snapshot is sent again, it is either preempted, + # or chunked prefill. + assert isinstance(metadata_or_delta, SequenceGroupMetadata) + self._seq_group_metadata_cache[request_id] = metadata_or_delta new_seq_group_metadata_list.append( self._seq_group_metadata_cache[request_id]) return new_seq_group_metadata_list From 1e6196bd5bc500186cef8b1a110fdd3042cbbe32 Mon Sep 17 00:00:00 2001 From: sang Date: Fri, 2 Aug 2024 17:40:55 -0700 Subject: [PATCH 12/36] working e2e --- .buildkite/test-pipeline.yaml | 1 + b.py | 39 ++--- c.py | 20 +++ tests/prompts/example.txt | 9 +- vllm/adapter_commons/request.py | 1 - vllm/core/scheduler.py | 59 ++++---- vllm/executor/ray_gpu_executor.py | 34 ++--- vllm/executor/ray_utils.py | 33 +++-- vllm/inputs/registry.py | 3 +- vllm/lora/request.py | 6 +- vllm/model_executor/models/blip.py | 4 +- vllm/model_executor/models/blip2.py | 4 +- vllm/model_executor/models/chameleon.py | 4 +- vllm/model_executor/models/clip.py | 4 +- vllm/model_executor/models/fuyu.py | 4 +- vllm/model_executor/models/minicpmv.py | 4 +- vllm/model_executor/models/paligemma.py | 4 +- vllm/model_executor/sampling_metadata.py | 4 +- vllm/pooling_params.py | 2 +- vllm/prompt_adapter/request.py | 7 +- vllm/sampling_params.py | 23 ++- vllm/sequence.py | 177 ++++++++++++----------- vllm/spec_decode/batch_expansion.py | 6 +- vllm/spec_decode/metrics.py | 7 +- vllm/spec_decode/spec_decode_worker.py | 7 +- vllm/worker/model_runner.py | 6 +- vllm/worker/worker.py | 11 +- 27 files changed, 253 insertions(+), 230 deletions(-) create mode 100644 c.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7b705b958e2..7eee769a34f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -98,6 +98,7 @@ steps: - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - 
DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py diff --git a/b.py b/b.py index 7d591897247..39c6cdc3ab1 100644 --- a/b.py +++ b/b.py @@ -1,5 +1,6 @@ import time from array import array +from vllm.sequence import SequenceData def t(): @@ -47,29 +48,29 @@ def __exit__(self, exc_type, exc_val, exc_tb): encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) decoder = msgspec.msgpack.Decoder(dec_hook=dec_hook) -l = [i for i in range(256)] -d = {"1": l} +# l = [i for i in range(256)] +# d = {"1": l} -with Timer("Serialization array"): - # a = array('l') - # a.fromlist(l) - data = encoder.encode(a) -with Timer("Deserialization"): - data = decoder.decode(data) +# with Timer("Serialization array"): +# # a = array('l') +# # a.fromlist(l) +# data = encoder.encode(a) +# with Timer("Deserialization"): +# data = decoder.decode(data) -l = [i for i in range(256)] -a = array('l') +l = [i for i in range(64 * 256)] +a = array('I') a.fromlist(l) +# a = SequenceData(a) -with Timer("Serialization bigger array"): - # a = array('l') - # a.fromlist(l) +# with Timer("Serialization sequence data"): +# # a = array('l') +# # a.fromlist(l) +# data = encoder.encode(a) +# with Timer("Deserialization"): +# data = decoder.decode(data) + +with Timer("Serialization array"): data = encoder.encode(a) with Timer("Deserialization"): data = decoder.decode(data) - -# for _ in range(5): -# with Timer("Serialization list"): -# data = encoder.encode(l) -# with Timer("Deserialization"): -# data = decoder.decode(data) diff --git a/c.py b/c.py new file mode 100644 index 00000000000..86a55fe0c44 --- /dev/null +++ b/c.py @@ -0,0 +1,20 @@ +import time +import numpy as np + +class Timer: + + def __init__(self, msg): + self.msg = msg + + def __enter__(self): + self.start = time.time() + return self # This allows access to the instance in the 'as' part of the context manager + + def __exit__(self, exc_type, exc_val, exc_tb): + self.end = time.time() + self.elapsed_us = (self.end - self.start) * 1000 * 1000 + print(f"{self.msg=}. Elapsed time: {self.elapsed_us:.2f} us") +l = [i for i in range(4096)] +from array import array +with Timer("converesion"): + arr = array("I", l) diff --git a/tests/prompts/example.txt b/tests/prompts/example.txt index cef4d1d7687..6e8c45b673e 100644 --- a/tests/prompts/example.txt +++ b/tests/prompts/example.txt @@ -1,8 +1 @@ -vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. -Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. -Compare and contrast artificial intelligence with human intelligence in terms of processing information. -Describe the basic components of a neural network and how it can be trained. -Write a short story about a robot that dreams for the first time. -Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. -Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. -Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' \ No newline at end of file +vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
\ No newline at end of file diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py index b9ebc50cc5a..c4398afb74f 100644 --- a/vllm/adapter_commons/request.py +++ b/vllm/adapter_commons/request.py @@ -1,5 +1,4 @@ from abc import abstractmethod -from dataclasses import dataclass class AdapterRequest: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c18d4a72ce0..8c9fce6515d 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -14,7 +14,7 @@ from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, - SequenceGroupMetadata, SequenceGroupMetadataDecode, + SequenceGroupMetadata, SequenceGroupMetadataDelta, SequenceStatus) logger = init_logger(__name__) @@ -333,9 +333,6 @@ def __init__( if self.enable_artificial_preemption else 0) self.num_cumulative_preemption: int = 0 - from collections import defaultdict - self._block_table_cache: Dict[int, Dict[int, - List[int]]] = defaultdict(dict) @property def lora_enabled(self) -> bool: @@ -998,25 +995,11 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # seq_id -> physical block numbers block_tables: Dict[int, List[int]] = {} - is_prompt = seq_group.is_prefill() for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data - if is_prompt or not envs.VLLM_USE_RAY_SPMD_WORKER or True: - block_table = self.block_manager.get_block_table(seq) - block_tables[seq_id] = block_table - self._block_table_cache[ - seq_group.request_id][seq_id] = block_table - else: - block_table = self.block_manager.get_block_table(seq) - if len(self._block_table_cache[seq_group.request_id] - [seq_id]) < len(block_table): - block_tables[seq_id] = [block_table[-1]] - self._block_table_cache[ - seq_group.request_id][seq_id].append( - block_table[-1]) - else: - block_tables[seq_id] = [] + block_table = self.block_manager.get_block_table(seq) + block_tables[seq_id] = block_table self.block_manager.access_all_blocks_in_seq(seq, now) common_computed_block_nums = ( @@ -1024,22 +1007,30 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_group.get_seqs(status=SequenceStatus.RUNNING))) do_sample = True + is_prompt = seq_group.is_prefill() + # We should send the metadata to workers when the first prefill + # is sent. Subsequent requests could be chunked prefill or decode. + is_first_prefill = False if is_prompt: seqs = seq_group.get_seqs() # Prefill has only 1 sequence. assert len(seqs) == 1 + num_computed_tokens = seqs[0].data.get_num_computed_tokens() + is_first_prefill = num_computed_tokens == 0 # In the next iteration, all prompt tokens are not computed. # It means the prefill is chunked, and we don't need sampling. # NOTE: We use get_len instead of get_prompt_len because when # a sequence is preempted, prefill includes previous generated # output tokens. - if (token_chunk_size + seqs[0].data.get_num_computed_tokens() < + if (token_chunk_size + num_computed_tokens < seqs[0].data.get_len()): do_sample = False # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. - if is_prompt or not envs.VLLM_USE_RAY_SPMD_WORKER: + # When SPMD mode is enabled, we only send delta data except for + # the first request to reduce serialization cost. 
+ if is_first_prefill or not envs.VLLM_USE_RAY_SPMD_WORKER: seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, is_prompt=is_prompt, @@ -1047,31 +1038,31 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, do_sample=do_sample, - # pooling_params=seq_group.pooling_params, + pooling_params=seq_group.pooling_params, token_chunk_size=token_chunk_size, - # lora_request=seq_group.lora_request, + lora_request=seq_group.lora_request, computed_block_nums=common_computed_block_nums, - # state=seq_group.state, - # # `multi_modal_data` will only be present for the 1st comm - # # between engine and worker. - # # the subsequent comms can still use delta, but - # # `multi_modal_data` will be None. - # multi_modal_data=seq_group.multi_modal_data - # if scheduler_outputs.num_prefill_groups > 0 else None, - # prompt_adapter_request=seq_group.prompt_adapter_request, + # `multi_modal_data` will only be present for the 1st comm + # between engine and worker. + # the subsequent comms can still use delta, but + # `multi_modal_data` will be None. + multi_modal_data=seq_group.multi_modal_data + if scheduler_outputs.num_prefill_groups > 0 else None, + prompt_adapter_request=seq_group.prompt_adapter_request, ) else: # Delta is used only for spmd workers. seq_data_delta = {} for id, data in seq_data.items(): seq_data_delta[id] = data.get_delta() - - seq_group_metadata = SequenceGroupMetadataDecode( + seq_group_metadata = SequenceGroupMetadataDelta( seq_data_delta, seq_group.request_id, block_tables, + is_prompt, do_sample=do_sample, token_chunk_size=token_chunk_size, + computed_block_nums=common_computed_block_nums, ) seq_group_metadata_list.append(seq_group_metadata) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index b11699a38b5..3ecc13ecd3a 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -3,13 +3,9 @@ from collections import defaultdict from itertools import islice, repeat from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple -import time import msgspec -import pickle from array import array -from regex import P - import vllm.envs as envs from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) @@ -68,14 +64,14 @@ def _init_executor(self) -> None: # Create the parallel GPU workers. 
self._init_workers_ray(placement_group) - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None - def enc_hook(obj: Any) -> Any: if isinstance(obj, array): # convert the complex to a tuple of real, imag return obj.tobytes() - self.encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) + self.input_encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) + self.output_decoder = msgspec.msgpack.Decoder( + Optional[List[SamplerOutput]]) def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> Dict[str, Any]: @@ -290,22 +286,26 @@ def execute_model( if self.forward_dag is None: self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) - s = time.time() + # s = time.time() + # import pickle # serialized_data = pickle.dumps(execute_model_req) - serialized_data = self.encoder.encode(execute_model_req) + serialized_data = self.input_encoder.encode(execute_model_req) # # Open a file in binary write mode - import sys - if sys.getsizeof(serialized_data) > 60000: - with open('example.bin', 'wb') as file: - # Write bytes to the file - file.write(serialized_data) + # import sys + # if sys.getsizeof(serialized_data) > 60000: + # with open('example.bin', 'wb') as file: + # # Write bytes to the file + # file.write(serialized_data) - # print(f"SANG-TODO input serialization takes {(time.time() - s) * 1000} ms index: {self.i}") + # print("SANG-TODO input serialization takes " + # f"{(time.time() - s) * 1000} ms index: {self.i}") outputs = ray.get(self.forward_dag.execute(serialized_data)) - output = pickle.loads(outputs[0]) - # print(f"SANG-TODO e2e takes {(time.time() - s) * 1000} ms index: {self.i}") + # output = pickle.loads(outputs[0]) + output = self.output_decoder.decode(outputs[0]) + # print(f"SANG-TODO e2e takes {(time.time() - s) * 1000} " + # f"ms index: {self.i}") self.i += 1 return output diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 67f3bb69723..0ca28e97394 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,6 +1,4 @@ from typing import List, Optional, Tuple, Type, Any -import time -import pickle import msgspec from array import array @@ -30,12 +28,13 @@ def __init__(self, *args, **kwargs) -> None: def dec_hook(type: Type, obj: Any) -> Any: if type is array: - deserialized = array('l') + deserialized = array('I') deserialized.frombytes(obj) return deserialized - self.decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, - dec_hook=dec_hook) + self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, + dec_hook=dec_hook) + self.output_encoder = msgspec.msgpack.Encoder() def get_node_ip(self) -> str: return get_ip() @@ -45,15 +44,18 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: gpu_ids = ray.get_gpu_ids() return node_id, gpu_ids - def execute_model_spmd(self, execute_model_req: bytes): + def execute_model_spmd(self, serialized_execute_model_req: bytes): """Used only when SPMD worker and compiled DAG are both enabled.""" - s = time.time() - - execute_model_req: ExecuteModelRequest = self.decoder.decode( - execute_model_req) - # execute_model_req: ExecuteModelRequest = pickle.loads(execute_model_req) - # print(f"SANG-TODO input deserialization takes {(time.time() - s) * 1000} ms index: {self.i}") + # s = time.time() + + execute_model_req: ExecuteModelRequest = self.input_decoder.decode( + serialized_execute_model_req) + # import pickle + # execute_model_req: ExecuteModelRequest = ( + # pickle.loads(execute_model_req)) + # print("SANG-TODO input deserialization takes " + # f"{(time.time() - s) * 1000} ms index: 
{self.i}") # TODO(swang): This is needed right now because Ray aDAG executes # on a background thread, so we need to reset torch's current # device. @@ -61,9 +63,12 @@ def execute_model_spmd(self, execute_model_req: bytes): if not self.compiled_dag_cuda_device_set: torch.cuda.set_device(self.worker.device) self.compiled_dag_cuda_device_set = True + output = self.worker._execute_model_spmd(execute_model_req) - output = pickle.dumps(output) - # print(f"SANG-TODO worker takes {(time.time() - s) * 1000} ms index: {self.i}") + # output = pickle.dumps(output) + output = self.output_encoder.encode(output) + # print("SANG-TODO worker takes " + # f"{(time.time() - s) * 1000} ms index: {self.i}") self.i += 1 return output diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 4a7e5c58329..287cbe336fa 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -1,3 +1,4 @@ +from array import array import functools from dataclasses import dataclass from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type, @@ -106,7 +107,7 @@ def _default_dummy_data_factory( # Avoid circular import from vllm.sequence import SequenceData - dummy_seq_data = SequenceData([0] * seq_len) + dummy_seq_data = SequenceData(array("I", [0] * seq_len)) dummy_multi_modal_data = None return dummy_seq_data, dummy_multi_modal_data diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 002cceae475..94577bf0235 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -1,12 +1,14 @@ import warnings -from dataclasses import dataclass, field from typing import Optional from vllm.adapter_commons.request import AdapterRequest import msgspec -class LoRARequest(msgspec.Struct, AdapterRequest): +class LoRARequest(msgspec.Struct, + AdapterRequest, + omit_defaults=True, + array_like=True): """ Request for a LoRA adapter. 
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 0b124d5e8a8..df750d8dc73 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,7 +1,7 @@ """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" from typing import Optional, Union - +from array import array import torch import torch.nn as nn from PIL import Image @@ -55,7 +55,7 @@ def dummy_seq_data_for_blip( token_ids = [image_token_id] * image_feature_size token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(token_ids) + return SequenceData(array("I", token_ids)) def dummy_image_for_blip( diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index e00e6c08069..5c0310a7d6d 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,5 +1,5 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict - +from array import array import torch import torch.nn as nn from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, @@ -411,7 +411,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int): image_feature_size = get_blip2_image_feature_size(hf_config) token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size token_ids += [0] * (seq_len - image_feature_size) - seq_data = SequenceData(token_ids) + seq_data = SequenceData(array("I", token_ids)) if isinstance(vision_config, Blip2VisionConfig): mm_data = dummy_image_for_blip(vision_config) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 7659f598bab..8562d588cbb 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,7 +1,7 @@ from functools import cached_property from typing import (Any, Dict, Iterable, List, Literal, Optional, Tuple, TypedDict) - +from array import array import torch import torch.nn.functional as F from PIL import Image @@ -70,7 +70,7 @@ def dummy_seq_data_for_chameleon( token_ids = [image_token_id] * image_feature_size token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(token_ids) + return SequenceData(array("I", token_ids)) def dummy_image_for_chameleon( diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index b4f628061f1..fb4b546b3fa 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,7 +1,7 @@ """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" from typing import Optional - +from array import array import torch import torch.nn as nn from PIL import Image @@ -53,7 +53,7 @@ def dummy_seq_data_for_clip( token_ids = [image_token_id] * image_feature_size token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(token_ids) + return SequenceData(array("I", token_ids)) def dummy_image_for_clip( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index fdea8ee30ce..d843185e1fd 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -17,7 +17,7 @@ """ PyTorch Fuyu model.""" import math from typing import Iterable, List, Literal, Optional, Tuple, TypedDict - +from array import array import torch import torch.nn as nn import torch.utils.checkpoint @@ -100,7 +100,7 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int): token_ids = ([_IMAGE_TOKEN_ID] * ncol + [_NEWLINE_TOKEN_ID]) * nrow token_ids += 
[0] * (seq_len - image_feature_size) - return SequenceData(token_ids) + return SequenceData(array("I", token_ids)) def dummy_image_for_fuyu( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 8563216d9c3..9664e40adfc 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -25,7 +25,7 @@ import re from functools import partial from typing import Iterable, List, Optional, Tuple - +from array import array import numpy as np import torch import torch.nn.functional as F @@ -319,7 +319,7 @@ def get_max_minicpmv_image_tokens(ctx: InputContext): def dummy_seq_data_for_minicpmv(seq_len: int): token_ids = [0] * seq_len - return SequenceData(token_ids) + return SequenceData(array("I", token_ids)) def dummy_image_for_minicpmv(hf_config): diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 2af48b6bc19..d74d754c607 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,5 +1,5 @@ from typing import Iterable, List, Literal, Optional, Tuple, TypedDict - +from array import array import torch from PIL import Image from torch import nn @@ -51,7 +51,7 @@ def dummy_seq_data_for_paligemma( token_ids = [image_token_id] * image_feature_size token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(token_ids) + return SequenceData(array("I", token_ids)) def dummy_image_for_paligemma( diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 59cfec9ec89..2d187fa0950 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -440,9 +440,9 @@ def from_sampling_metadata( and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) prompt_tokens.extend( - array('l') for _ in range(prefill_len)) + array('I') for _ in range(prefill_len)) output_tokens.extend( - array('l') for _ in range(prefill_len)) + array('I') for _ in range(prefill_len)) if seq_group.do_sample: for seq_id in seq_ids: seq_data = seq_group.seq_data[seq_id] diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index f015504e5b4..650b815ba51 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -2,7 +2,7 @@ import msgspec -class PoolingParams(msgspec.Struct, omit_defaults=True): +class PoolingParams(msgspec.Struct, omit_defaults=True, array_like=True): """Pooling parameters for pooling. Attributes: diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index a7e71a75f6c..a2d0a74b576 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -1,10 +1,11 @@ -from dataclasses import dataclass - from vllm.adapter_commons.request import AdapterRequest import msgspec -class PromptAdapterRequest(msgspec.Struct, AdapterRequest, array_like=True): +class PromptAdapterRequest(msgspec.Struct, + AdapterRequest, + array_like=True, + omit_defaults=True): """ Request for a Prompt adapter. 
""" diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index f7a24e76687..5f0a7fa4515 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -1,11 +1,9 @@ """Sampling parameters for text generation.""" import copy from enum import IntEnum -from functools import cached_property from typing import Any, Callable, Dict, List, Optional, Union, Set import torch -from pydantic import Field from typing_extensions import Annotated import msgspec @@ -33,7 +31,7 @@ class SamplingType(IntEnum): to sample from.""" -class SamplingParams(msgspec.Struct, omit_defaults=True, array_like=False): +class SamplingParams(msgspec.Struct, omit_defaults=True): """Sampling parameters for text generation. Overall, we follow the sampling parameters from the OpenAI text completion @@ -138,7 +136,9 @@ class SamplingParams(msgspec.Struct, omit_defaults=True, array_like=False): detokenize: bool = True skip_special_tokens: bool = True spaces_between_special_tokens: bool = True - # logits_processors: Optional[List[LogitsProcessor]] = None + # Optional[List[LogitsProcessor]] type. We use Any here because + # Optional[List[LogitsProcessor]] type is not supported by msgspec. + # We will also remove this API soon. logits_processors: Optional[Any] = None include_stop_str_in_output: bool = False truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None @@ -146,7 +146,7 @@ class SamplingParams(msgspec.Struct, omit_defaults=True, array_like=False): # The below fields are not supposed to be used as an input. # They are set in post_init. output_text_buffer_length: int = 0 - all_stop_token_ids: Optional[Set[int]] = None + _all_stop_token_ids: Set[int] = msgspec.field(default_factory=set) def __post_init__(self) -> None: self.best_of = self.best_of or self.n @@ -182,11 +182,12 @@ def __post_init__(self) -> None: self.min_p = 0.0 self._verify_greedy_sampling() # eos_token_id is added to this by the engine - self.all_stop_token_ids = set(self.stop_token_ids) + self._all_stop_token_ids = set(self.stop_token_ids) def _verify_args(self) -> None: if self.n < 1: raise ValueError(f"n must be at least 1, got {self.n}.") + assert isinstance(self.best_of, int) if self.best_of < self.n: raise ValueError(f"best_of must be greater than or equal to n, " f"got n={self.n} and best_of={self.best_of}.") @@ -230,6 +231,7 @@ def _verify_args(self) -> None: and self.truncate_prompt_tokens < 1): raise ValueError(f"truncate_prompt_tokens must be >= 1, " f"got {self.truncate_prompt_tokens}") + assert isinstance(self.stop, list) if any(not stop_str for stop_str in self.stop): raise ValueError("stop cannot contain an empty string.") if self.stop and not self.detokenize: @@ -263,6 +265,7 @@ def _verify_non_beam_search(self) -> None: "default value of 1.0 when not using beam search.") def _verify_greedy_sampling(self) -> None: + assert isinstance(self.best_of, int) if self.best_of > 1: raise ValueError("best_of must be 1 when using greedy sampling." f"Got {self.best_of}.") @@ -276,7 +279,7 @@ def update_from_generation_config( if model_eos_token_id is not None: # Add the eos token id into the sampling_params to support # min_tokens processing. - self.all_stop_token_ids.add(model_eos_token_id) + self._all_stop_token_ids.add(model_eos_token_id) # Update eos_token_id for generation if (eos_ids := generation_config.get("eos_token_id")) is not None: @@ -288,7 +291,7 @@ def update_from_generation_config( # purposes. 
eos_ids.discard(model_eos_token_id) if eos_ids: - self.all_stop_token_ids.update(eos_ids) + self._all_stop_token_ids.update(eos_ids) if not self.ignore_eos: eos_ids.update(self.stop_token_ids) self.stop_token_ids = list(eos_ids) @@ -303,6 +306,10 @@ def sampling_type(self) -> SamplingType: return SamplingType.RANDOM_SEED return SamplingType.RANDOM + @property + def all_stop_token_ids(self) -> Set[int]: + return self._all_stop_token_ids + def clone(self) -> "SamplingParams": """Deep copy excluding LogitsProcessor objects. diff --git a/vllm/sequence.py b/vllm/sequence.py index 112e2d4ce98..372a6ca1555 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from array import array from collections import defaultdict -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Set, Tuple, Union, Any) @@ -16,15 +16,14 @@ from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams +from vllm.multimodal.base import MultiModalDataDict +from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics if TYPE_CHECKING: from vllm.inputs import LLMInputs - from vllm.multimodal.base import MultiModalDataDict - from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics -@dataclass -class Logprob: +class Logprob(msgspec.Struct, omit_defaults=True, array_like=True): """Infos for supporting OpenAI compatible logprobs and token ranks. Attributes: @@ -123,33 +122,36 @@ class SequenceData(msgspec.Struct, omit_defaults=True): cumulative_logprob: The cumulative log probability of the output. """ _prompt_token_ids: array - _output_token_ids: Optional[array] = None + _output_token_ids: array = msgspec.field( + default_factory=lambda: array("I", [])) ### The below fields should not be passed as an argument ### - cumulative_logprob: float = 0.0 - _prompt_token_ids_tuple: Optional[Tuple[int, ...]] = None + _cumulative_logprob: float = 0.0 + _prompt_token_ids_tuple: Tuple[int, + ...] = msgspec.field(default_factory=tuple) # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL - # New output tokens appended. Used to get delta input. + + # Used to get delta input. _new_appended_tokens: List[int] = msgspec.field(default_factory=list) _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) def __post_init__(self, ) -> None: - if not isinstance(self._prompt_token_ids, array): - self._prompt_token_ids = array('l', self._prompt_token_ids) - if not isinstance(self._output_token_ids, array): - self._output_token_ids = array( - 'l', self._output_token_ids - if self._output_token_ids is not None else []) self._prompt_token_ids_tuple: Tuple[int, ...] 
= tuple( self._prompt_token_ids) self._update_cached_all_tokens() def _update_cached_all_tokens(self): + assert isinstance(self._prompt_token_ids, array) + assert isinstance(self._output_token_ids, array) self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + self._output_token_ids) + @property + def cumulative_logprob(self) -> float: + return self._cumulative_logprob + @property def prompt_token_ids(self) -> Tuple[int, ...]: return self._prompt_token_ids_tuple @@ -167,19 +169,20 @@ def output_token_ids(self) -> Tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter - def output_token_ids(self, new_output_token_ids) -> None: - self._output_token_ids = array('l', new_output_token_ids) + def output_token_ids(self, new_output_token_ids: List[int]) -> None: + self._output_token_ids = array('I', new_output_token_ids) self._update_cached_all_tokens() @property def output_token_ids_array(self) -> array: + assert isinstance(self._output_token_ids, array) return self._output_token_ids def append_token_id(self, token_id: int, logprob: float) -> None: self._output_token_ids.append(token_id) self._new_appended_tokens.append(token_id) self._cached_all_token_ids.append(token_id) - self.cumulative_logprob += logprob + self._cumulative_logprob += logprob def get_len(self) -> int: return len(self._output_token_ids) + len(self._prompt_token_ids) @@ -238,14 +241,14 @@ def get_last_token_id(self) -> int: return self._output_token_ids[-1] def get_prompt_token_ids(self) -> Tuple[int, ...]: - return self._prompt_token_ids + return self.prompt_token_ids def get_output_token_ids(self) -> Tuple[int, ...]: - return self._output_token_ids + return self.output_token_ids def get_delta(self) -> SequenceDataDelta: delta = SequenceDataDelta(self._new_appended_tokens, - self.cumulative_logprob, + self._cumulative_logprob, self.get_num_computed_tokens(), self.stage) # Reset delta state. self._new_appended_tokens = [] @@ -253,7 +256,7 @@ def get_delta(self) -> SequenceDataDelta: def apply_delta(self, delta: SequenceDataDelta): self._num_computed_tokens = delta.new_num_computed_tokens - self.cumulative_logprob = delta.new_cumulative_logprob + self._cumulative_logprob = delta.new_cumulative_logprob self._stage = delta.new_stage self._output_token_ids.extend(delta.new_output_token_ids) self._cached_all_token_ids.extend(delta.new_output_token_ids) @@ -299,7 +302,7 @@ def __init__( self.lora_request = lora_request self.prompt_adapter_request = prompt_adapter_request - self.data = SequenceData(self.prompt_token_ids) + self.data = SequenceData(array("I", self.prompt_token_ids)) self.output_logprobs: SampleLogprobs = [] self.output_text = "" @@ -561,14 +564,19 @@ def get_max_num_running_seqs(self) -> int: if self.sampling_params and self.sampling_params.use_beam_search: # For beam search, maximally there will always be `best_of` beam # candidates running in the future. - return self.sampling_params.best_of + best_of = self.sampling_params.best_of + assert isinstance(best_of, int) + return best_of else: - if (self.sampling_params - and self.sampling_params.best_of > self.num_seqs()): - # At prompt stage, the sequence group is not yet filled up - # and only have one sequence running. However, in the - # generation stage, we will have `best_of` sequences running. 
- return self.sampling_params.best_of + if self.sampling_params: + best_of = self.sampling_params.best_of + assert isinstance(best_of, int) + if best_of > self.num_seqs(): + # At prompt stage, the sequence group is not yet filled up + # and only have one sequence running. However, in the + # generation stage, we will have `best_of` sequences + # running. + return best_of # At sampling stages, return the number of actual sequences # that are not finished yet. return self.num_unfinished_seqs() @@ -650,16 +658,18 @@ def __repr__(self) -> str: f"num_seqs={len(self.seqs_dict)})") -class SequenceGroupMetadataDecode(msgspec.Struct, - tag=True, - array_like=True, - omit_defaults=True): +class SequenceGroupMetadataDelta(msgspec.Struct, + tag=True, + array_like=True, + omit_defaults=True): """Delta sequence group metadata.""" seq_data_delta: Dict[int, SequenceDataDelta] request_id: str block_tables: Dict[int, List[int]] + is_prompt: bool do_sample: bool = True token_chunk_size: Optional[int] = None + computed_block_nums: Optional[List[int]] = None class SequenceGroupMetadata(msgspec.Struct, @@ -742,21 +752,18 @@ def prompt_adapter_num_virtual_tokens(self) -> int: return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \ if self.prompt_adapter_request else 0 - def apply_delta( - self, sequence_group_metadata_decode: SequenceGroupMetadataDecode): - for id, delta in sequence_group_metadata_decode.seq_data_delta.items(): + def apply_delta(self, + sequence_group_metadata_delta: SequenceGroupMetadataDelta): + for id, delta in sequence_group_metadata_delta.seq_data_delta.items(): self.seq_data[id].apply_delta(delta) - self.request_id = sequence_group_metadata_decode.request_id - # for seq_id, block_table in sequence_group_metadata_decode.block_tables.items(): - # if len(block_table) > 0: - # self.block_tables[seq_id].append(block_table[0]) - self.block_tables = sequence_group_metadata_decode.block_tables - self.token_chunk_size = sequence_group_metadata_decode.token_chunk_size - self.do_sample = sequence_group_metadata_decode.do_sample - self.is_prompt = False + self.request_id = sequence_group_metadata_delta.request_id + self.block_tables = sequence_group_metadata_delta.block_tables + self.token_chunk_size = sequence_group_metadata_delta.token_chunk_size + self.do_sample = sequence_group_metadata_delta.do_sample + self.is_prompt = sequence_group_metadata_delta.is_prompt -class SequenceOutput: +class SequenceOutput(msgspec.Struct, omit_defaults=True, array_like=True): """The model output associated with a sequence. Args: @@ -766,16 +773,9 @@ class SequenceOutput: logprobs: The logprobs of the output token. 
(Token id -> logP(x_i+1 | x_0, ..., x_i)) """ - - def __init__( - self, - parent_seq_id: int, - output_token: int, - logprobs: Dict[int, Logprob], - ) -> None: - self.parent_seq_id = parent_seq_id - self.output_token = output_token - self.logprobs = logprobs + parent_seq_id: int + output_token: int + logprobs: Dict[int, Logprob] def __repr__(self) -> str: return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " @@ -803,17 +803,14 @@ def __eq__(self, other: object) -> bool: pass -class CompletionSequenceGroupOutput(SequenceGroupOutput): +class CompletionSequenceGroupOutput(msgspec.Struct, + omit_defaults=True, + array_like=True): + __metaclass__ = SequenceGroupOutput """The model output associated with a completion sequence group.""" - - def __init__( - self, - samples: List[SequenceOutput], - prompt_logprobs: Optional[PromptLogprobs], - ) -> None: - self.samples = samples - # Prompt logprob for each prompt query token. - self.prompt_logprobs = prompt_logprobs + samples: List[SequenceOutput] + # Prompt logprob for each prompt query token. + prompt_logprobs: Optional[PromptLogprobs] def __repr__(self) -> str: return (f"CompletionSequenceGroupOutput(samples={self.samples}, " @@ -826,14 +823,14 @@ def __eq__(self, other: object) -> bool: and self.prompt_logprobs == other.prompt_logprobs) -class EmbeddingSequenceGroupOutput(SequenceGroupOutput): +class EmbeddingSequenceGroupOutput( + msgspec.Struct, + omit_defaults=True, + array_like=True, +): """The model output associated with an embedding sequence group.""" - - def __init__( - self, - embeddings: List[float], - ) -> None: - self.embeddings = embeddings + __metaclass__ = SequenceGroupOutput + embeddings: List[int] def __repr__(self) -> str: return (f"EmbeddingSequenceGroupOutput(" @@ -845,8 +842,7 @@ def __eq__(self, other: object) -> bool: return self.embeddings == other.embeddings -@dataclass -class IntermediateTensors: +class IntermediateTensors(msgspec.Struct, omit_defaults=True, array_like=True): """For all pipeline stages except the last, we need to return the hidden states and residuals to be sent to the next stage. This data structure contains the hidden states and residuals for a request. @@ -873,8 +869,7 @@ def __repr__(self) -> str: return f"IntermediateTensors(tensors={self.tensors})" -@dataclass -class SamplerOutput: +class SamplerOutput(msgspec.Struct, omit_defaults=True, array_like=True): """For each sequence group, we generate a list of SequenceOutput object, each of which contains one possible candidate for the next token. @@ -894,7 +889,7 @@ class SamplerOutput: sampled_token_ids: Optional[torch.Tensor] = None # Spec decode metrics populated by workers. - spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None # Optional last hidden states from the model. 
hidden_states: Optional[torch.Tensor] = None @@ -926,12 +921,11 @@ def __repr__(self) -> str: f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") -@dataclass -class PoolerOutput: +class PoolerOutput(msgspec.Struct, omit_defaults=True, array_like=True): """The output from a pooling operation in the embedding model.""" outputs: List[EmbeddingSequenceGroupOutput] - spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None def __getitem__(self, idx: int): return self.outputs[idx] @@ -978,28 +972,34 @@ class HiddenStates(msgspec.Struct, array_like=True, omit_defaults=True): seq_ids are the sequence ids of each entry of the batch dimension of the hidden_states tensor""" - seq_ids: List[int] + seq_group_metadata_list: List[SequenceGroupMetadata] hidden_states: torch.Tensor + _seq_ids: List[int] = msgspec.field(default_factory=list) def __post_init__(self): + self._seq_ids = get_all_seq_ids(self.seq_group_metadata_list) assert len(self.seq_group_metadata_list) == len(self.hidden_states) + @property + def seq_ids(self) -> List[int]: + return self._seq_ids + def update(self, seq_group_metadata_list: List[SequenceGroupMetadata], hidden_states: torch.Tensor) -> None: """Update hidden states from target model invocation.""" assert len(seq_group_metadata_list) == len(hidden_states) - self.seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) + self._seq_ids.extend(get_all_seq_ids(seq_group_metadata_list)) self.hidden_states = torch.cat([self.hidden_states, hidden_states]) def prune(self, seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: """Prune to provided list of sequence ids.""" seq_ids = get_all_seq_ids(seq_group_metadata_list) - if seq_ids != self.seq_ids: + if seq_ids != self._seq_ids: # Batch contents changed - prune removed sequences. - index = [self.seq_ids.index(seq_id) for seq_id in seq_ids] + index = [self._seq_ids.index(seq_id) for seq_id in seq_ids] self.hidden_states = self.hidden_states[index] - self.seq_ids = seq_ids + self._seq_ids = seq_ids class ExecuteModelRequest(msgspec.Struct, array_like=True, omit_defaults=True): @@ -1007,7 +1007,7 @@ class ExecuteModelRequest(msgspec.Struct, array_like=True, omit_defaults=True): engine should create an instance of this class for each request batch.""" # The sequence group metadata list. seq_group_metadata_list: List[Union[SequenceGroupMetadata, - SequenceGroupMetadataDecode]] + SequenceGroupMetadataDelta]] # Blocks to swap in. List of CPU -> GPU block number. 
blocks_to_swap_in: List[Tuple[int, int]] = msgspec.field(default_factory=list) @@ -1030,7 +1030,8 @@ class ExecuteModelRequest(msgspec.Struct, array_like=True, omit_defaults=True): finished_requests_ids: List[str] = msgspec.field(default_factory=list) def clone( - self, seq_group_metadata_list: List[SequenceGroupMetadata] + self, seq_group_metadata_list: List[Union[SequenceGroupMetadata, + SequenceGroupMetadataDelta]] ) -> "ExecuteModelRequest": """Clone the request with a new sequence group metadata list.""" return ExecuteModelRequest( diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 5d0934a8aea..5aed0ab992f 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -1,6 +1,6 @@ from itertools import chain, count from typing import Iterator, List, Tuple - +from array import array import torch from vllm import SamplingParams @@ -299,8 +299,8 @@ def _create_single_target_seq_group_metadata( new_seq_data_dict = { target_seq_id: SequenceData( - prompt_token_ids=prompt_token_ids, - _output_token_ids=new_output_token_ids, + array("I", prompt_token_ids), + _output_token_ids=array("I", new_output_token_ids), ), } # This is a hack. Technically, spec decoding should compute diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 9036d117041..1029c855705 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -1,6 +1,6 @@ import time -from dataclasses import dataclass from typing import Callable, Optional +import msgspec import torch @@ -9,8 +9,9 @@ from vllm.utils import is_pin_memory_available -@dataclass -class SpecDecodeWorkerMetrics: +class SpecDecodeWorkerMetrics(msgspec.Struct, + omit_defaults=True, + array_like=True): """Dataclass holding metrics emitted from the spec decode worker. 
""" diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 91674427dbd..ad8c0cee0b5 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -465,8 +465,7 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, if hidden_states is not None: if self.previous_hidden_states is None: self.previous_hidden_states = HiddenStates( - get_all_seq_ids(execute_model_req.seq_group_metadata_list), - hidden_states) + execute_model_req.seq_group_metadata_list, hidden_states) else: self.previous_hidden_states.update( execute_model_req.seq_group_metadata_list, hidden_states) @@ -635,8 +634,8 @@ def _verify_tokens( index = accepted_index[:, None, None].expand(-1, 1, hs_size) hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d # Store hidden states from target model for subsequent decode step - self.previous_hidden_states = HiddenStates( - get_all_seq_ids(seq_group_metadata_list), hidden_states) + self.previous_hidden_states = HiddenStates(seq_group_metadata_list, + hidden_states) return accepted_token_ids, logprobs diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 39816aadb81..5a864495e5f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -914,9 +914,9 @@ def profile_run(self) -> None: seq_data={group_id: seq_data}, sampling_params=sampling_params, block_tables=None, - # lora_request=dummy_lora_requests_per_seq[group_id] - # if dummy_lora_requests_per_seq else None, - # multi_modal_data=dummy_multi_modal_data, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + multi_modal_data=dummy_multi_modal_data, ) seqs.append(seq) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 80b73305152..ed1437b6513 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,7 @@ from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata, SequenceGroupMetadataDecode) + SequenceGroupMetadata, SequenceGroupMetadataDelta) from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -303,14 +303,15 @@ def _get_cached_seq_group_metadata(self, seq_group_metadata_list): assert isinstance(metadata_or_delta, SequenceGroupMetadata) self._seq_group_metadata_cache[request_id] = metadata_or_delta else: - if isinstance(metadata_or_delta, SequenceGroupMetadataDecode): + if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): self._seq_group_metadata_cache[request_id].apply_delta( metadata_or_delta) else: - # If metadata snapshot is sent again, it is either preempted, - # or chunked prefill. + # If metadata snapshot is sent again, it is either + # preempted, or chunked prefill. Reset the cache. 
assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[request_id] = metadata_or_delta + self._seq_group_metadata_cache[ + request_id] = metadata_or_delta new_seq_group_metadata_list.append( self._seq_group_metadata_cache[request_id]) return new_seq_group_metadata_list From 35e963746500c3a0ae4cb116336907a8bafbd423 Mon Sep 17 00:00:00 2001 From: sang Date: Sat, 3 Aug 2024 03:01:22 -0700 Subject: [PATCH 13/36] working finally --- c.py | 21 ++++++++-- tests/distributed/test_pipeline_parallel.py | 45 +++++++++++---------- vllm/config.py | 1 - vllm/engine/llm_engine.py | 1 - vllm/executor/ray_gpu_executor.py | 6 +-- vllm/executor/ray_utils.py | 40 ++++++++++-------- vllm/lora/request.py | 6 +-- vllm/prompt_adapter/request.py | 3 +- vllm/sequence.py | 2 +- vllm/worker/worker.py | 10 +++-- 10 files changed, 77 insertions(+), 58 deletions(-) diff --git a/c.py b/c.py index 86a55fe0c44..5dae752a306 100644 --- a/c.py +++ b/c.py @@ -1,6 +1,7 @@ import time import numpy as np + class Timer: def __init__(self, msg): @@ -14,7 +15,19 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.end = time.time() self.elapsed_us = (self.end - self.start) * 1000 * 1000 print(f"{self.msg=}. Elapsed time: {self.elapsed_us:.2f} us") -l = [i for i in range(4096)] -from array import array -with Timer("converesion"): - arr = array("I", l) + + +# l = [i for i in range(4096)] +# from array import array +# with Timer("converesion"): +# arr = array("I", l) + +from ray import cloudpickle as pickle +# import pickle + +bytes = b"1" * 65665 +with Timer("bytes pickling"): + data = pickle.dumps(bytes) +with Timer("bytes deser"): + pickle.loads(data) + diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index ab325e09669..431ce3ef05c 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -16,28 +16,29 @@ @pytest.mark.parametrize( ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, " - "MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL"), [ - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - ]) + "MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL"), + [ + # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), + # (2, 2, 
1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), + # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), + # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), + # (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), + # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False), + # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), + # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), + # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False), + # (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), + # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + (1, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + # (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + ]) def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL): if VLLM_MULTI_NODE and DIST_BACKEND == "mp": diff --git a/vllm/config.py b/vllm/config.py index 40e48e8245a..ef56e2b6395 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -6,7 +6,6 @@ import torch from transformers import PretrainedConfig -import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.models import ModelRegistry diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index db9b2b191c8..b42d87a15af 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -899,7 +899,6 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: "as performance will be severely degraded otherwise.") seq_group_metadata_list, scheduler_outputs = self.scheduler[ 0].schedule() - # print("SANG-TODO batch size,", len(seq_group_metadata_list)) if not scheduler_outputs.is_empty(): finished_requests_ids = self.scheduler[ diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index d58fdb05072..c234a23f0a7 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -126,10 +126,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", ray_remote_kwargs = self._configure_ray_workers_use_nsight( ray_remote_kwargs) - logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) # Create the workers. 
driver_ip = get_ip() - logger.info("driver_ip: %s", driver_ip) worker_wrapper_kwargs = self._get_worker_wrapper_args() for bundle_id, bundle in enumerate(placement_group.bundle_specs): if not bundle.get("GPU", 0): @@ -313,7 +311,7 @@ def execute_model( # import pickle # serialized_data = pickle.dumps(execute_model_req) - serialized_data = self.input_encoder.encode((execute_model_req, None)) + serialized_data = self.input_encoder.encode(execute_model_req) # # Open a file in binary write mode # import sys # if sys.getsizeof(serialized_data) > 60000: @@ -504,7 +502,7 @@ async def execute_model_async( if self.forward_dag is None: self.forward_dag = self._compiled_ray_dag(enable_asyncio=True) - serialized_data = self.input_encoder.encode((execute_model_req, None)) + serialized_data = self.input_encoder.encode(execute_model_req) dag_future = await self.forward_dag.execute_async(serialized_data) outputs = await dag_future return self.output_decoder.decode(outputs[0]) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index fff8540aa80..5d9c3fb5929 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Type, Any +from typing import List, Optional, Tuple, Type, Any, Union import msgspec from array import array @@ -13,8 +13,6 @@ try: import ray - ModelRequest = Tuple[ExecuteModelRequest, Optional[IntermediateTensors]] - class RayWorkerWrapper(WorkerWrapperBase): """Ray wrapper for vllm.worker.Worker, allowing Worker to be lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" @@ -34,9 +32,14 @@ def dec_hook(type: Type, obj: Any) -> Any: deserialized.frombytes(obj) return deserialized - self.input_decoder = msgspec.msgpack.Decoder(ModelRequest, + def enc_hook(obj: Any) -> Any: + if isinstance(obj, array): + # convert the complex to a tuple of real, imag + return obj.tobytes() + + self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=dec_hook) - self.output_encoder = msgspec.msgpack.Encoder() + self.output_encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) def get_node_ip(self) -> str: return get_ip() @@ -47,21 +50,25 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: return node_id, gpu_ids def execute_model_spmd( - self, - serialized_model_request: bytes) -> bytes: + self, req_or_tuple: Union[bytes, Tuple[ + bytes, Optional[IntermediateTensors]]] + ) -> bytes: """Execute model in SPMD fashion: used only when SPMD worker and compiled DAG are both enabled. Args: - req_or_tuple: A tuple containing the request and intermediate - tensors. Intermediate tensors are None unless if it is - provided because it is > 0 pipeline stage. The value is - serialized (to optimize the serialization performance). + req_or_tuple: A requset or a tuple containing the + request and intermediate tensors. Intermediate tensors are + None unless if it is provided because it is > 0 pipeline + stage. The request is serialized by msgspec. """ # s = time.time() - - execute_model_req, intermediate_tensors: ModelRequest = ( - self.input_decoder.decode(serialized_model_request)) + if isinstance(req_or_tuple, bytes): + serialized_data, intermediate_tensors = req_or_tuple, None + else: + serialized_data, intermediate_tensors = req_or_tuple + execute_model_req = self.input_decoder.decode( + serialized_data) # import pickle # execute_model_req: ExecuteModelRequest = ( @@ -80,10 +87,11 @@ def execute_model_spmd( intermediate_tensors) # Pipeline model request and output to the next pipeline stage. 
if isinstance(output, IntermediateTensors): - output = execute_model_req, output + output = self.output_encoder.encode(execute_model_req), output + else: + output = self.output_encoder.encode(output) # output = pickle.dumps(output) - output = self.output_encoder.encode(output) # print("SANG-TODO worker takes " # f"{(time.time() - s) * 1000} ms index: {self.i}") self.i += 1 diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 94577bf0235..31d52d8a83b 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -5,10 +5,7 @@ import msgspec -class LoRARequest(msgspec.Struct, - AdapterRequest, - omit_defaults=True, - array_like=True): +class LoRARequest(msgspec.Struct, omit_defaults=True, array_like=True): """ Request for a LoRA adapter. @@ -20,6 +17,7 @@ class LoRARequest(msgspec.Struct, lora_int_id must be globally unique for a given adapter. This is currently not enforced in vLLM. """ + __metaclass__ = AdapterRequest lora_name: str lora_int_id: int diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index fa25d1b3c5c..9fb7932b15b 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -2,8 +2,7 @@ import msgspec -class PromptAdapterRequest(msgspec.Struct, - array_like=True, +class PromptAdapterRequest(msgspec.Struct, array_like=True, omit_defaults=True): """ Request for a Prompt adapter. diff --git a/vllm/sequence.py b/vllm/sequence.py index de2f3961f45..49a5bc8df46 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -842,7 +842,7 @@ def __eq__(self, other: object) -> bool: return self.embeddings == other.embeddings -class IntermediateTensors(msgspec.Struct, omit_defaults=True, array_like=True, tag=True): +class IntermediateTensors(msgspec.Struct, omit_defaults=True, array_like=True): """For all pipeline stages except the last, we need to return the hidden states and residuals to be sent to the next stage. This data structure contains the hidden states and residuals for a request. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index e1b807b05cc..0c3dfd29dca 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,8 @@ from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata, SequenceGroupMetadataDelta) + SequenceGroupMetadata, SequenceGroupMetadataDelta, + IntermediateTensors) from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -326,14 +327,17 @@ def execute_model( return output def _execute_model_spmd( - self, execute_model_req: ExecuteModelRequest + self, + execute_model_req: ExecuteModelRequest, + intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Optional[List[SamplerOutput]]: if execute_model_req is not None: new_seq_group_metadata_list = self._get_cached_seq_group_metadata( execute_model_req.seq_group_metadata_list) execute_model_req.seq_group_metadata_list = ( new_seq_group_metadata_list) - output = super()._execute_model_spmd(execute_model_req) + output = super()._execute_model_spmd(execute_model_req, + intermediate_tensors) return output def add_lora(self, lora_request: LoRARequest) -> bool: From 912b88beb406b40ff6e2ef7b628ca7e2d533a70e Mon Sep 17 00:00:00 2001 From: sang Date: Mon, 5 Aug 2024 10:03:38 -0700 Subject: [PATCH 14/36] . 
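When the model returns IntermediateTensors (i.e. this is not the last pipeline
stage), execute_model_spmd forwards the request to the next stage. The one-line
change below reuses the serialized request bytes that were just received instead
of re-encoding the decoded request with self.output_encoder, which would only
rebuild an equivalent buffer. A minimal sketch of that pass-through idea; the
Request struct and all names here are illustrative only, not vLLM types:

    from typing import List

    import msgspec

    class Request(msgspec.Struct, array_like=True, omit_defaults=True):
        request_id: str
        token_ids: List[int]

    encoder = msgspec.msgpack.Encoder()
    decoder = msgspec.msgpack.Decoder(Request)

    # Bytes received from the previous pipeline stage.
    serialized = encoder.encode(Request(request_id="r1", token_ids=[1, 2, 3]))
    req = decoder.decode(serialized)

    # Forward the request downstream: re-encoding `req` would just rebuild the
    # buffer we already hold, so the original bytes are passed through as-is.
    forwarded = serialized  # rather than encoder.encode(req)
    assert decoder.decode(forwarded) == req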
--- vllm/executor/ray_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 5d9c3fb5929..5a7ccee13c4 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -87,7 +87,7 @@ def execute_model_spmd( intermediate_tensors) # Pipeline model request and output to the next pipeline stage. if isinstance(output, IntermediateTensors): - output = self.output_encoder.encode(execute_model_req), output + output = serialized_data, output else: output = self.output_encoder.encode(output) From 5bab1924f5ad947ef8b623641a939d696a72a97b Mon Sep 17 00:00:00 2001 From: sang Date: Mon, 5 Aug 2024 11:52:09 -0700 Subject: [PATCH 15/36] working --- tests/basic_correctness/test_preemption.py | 18 ++++++++++++++++++ vllm/config.py | 8 ++++++-- vllm/core/scheduler.py | 8 +++----- vllm/engine/arg_utils.py | 4 ++++ vllm/sequence.py | 1 + 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 7aed0d5e1fa..643b9837438 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -8,6 +8,7 @@ import pytest from prometheus_client import REGISTRY +import vllm.envs as envs from vllm import SamplingParams from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT) @@ -24,6 +25,13 @@ "tests/basic_correctness/test_preemption.py`") +@pytest.fixture +def worker_use_ray() -> bool: + # When SPMD worker is used, use ray_use_worker=True + # to test delta input optimization works with preemption. + return envs.VLLM_USE_RAY_SPMD_WORKER + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -36,6 +44,7 @@ def test_chunked_prefill_recompute( dtype: str, max_tokens: int, chunked_prefill_token_size: int, + worker_use_ray: bool, ) -> None: """Ensure that chunked prefill works with preemption.""" max_num_seqs = min(chunked_prefill_token_size, 256) @@ -54,6 +63,7 @@ def test_chunked_prefill_recompute( max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, max_num_seqs=max_num_seqs, + worker_use_ray=worker_use_ray, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt @@ -79,6 +89,7 @@ def test_preemption( model: str, dtype: str, max_tokens: int, + worker_use_ray: bool, ) -> None: """By default, recompute preemption is enabled""" @@ -89,6 +100,7 @@ def test_preemption( model, dtype=dtype, disable_log_stats=False, + worker_use_ray=worker_use_ray, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt @@ -132,6 +144,7 @@ def test_swap( dtype: str, max_tokens: int, beam_width: int, + worker_use_ray: bool, ) -> None: """Use beam search enables swapping.""" example_prompts = example_prompts[:1] @@ -144,6 +157,7 @@ def test_swap( dtype=dtype, swap_space=10, disable_log_stats=False, + worker_use_ray=worker_use_ray, ) as vllm_model: vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, max_tokens) @@ -188,6 +202,7 @@ def test_swap_infeasible( dtype: str, max_tokens: int, beam_width: int, + worker_use_ray: bool, ) -> None: """Verify infeasible swap request will be ignored.""" BLOCK_SIZE = 16 @@ -204,6 +219,7 @@ def 
test_swap_infeasible( # decode blocks are not enough to finish. num_gpu_blocks_override=prefill_blocks + decode_blocks, max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE, + # worker_use_ray=worker_use_ray, ) as vllm_model: sampling_params = SamplingParams(n=beam_width, use_beam_search=True, @@ -230,6 +246,7 @@ def test_preemption_infeasible( model: str, dtype: str, max_tokens: int, + worker_use_ray: bool, ) -> None: """Verify infeasible preemption request will be ignored.""" BLOCK_SIZE = 16 @@ -244,6 +261,7 @@ def test_preemption_infeasible( # ignored instead of hanging forever. num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), + worker_use_ray=worker_use_ray, ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) diff --git a/vllm/config.py b/vllm/config.py index ef56e2b6395..6a3efc736d5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -691,8 +691,8 @@ def __init__( self.tokenizer_pool_config = tokenizer_pool_config self.ray_workers_use_nsight = ray_workers_use_nsight self.placement_group = placement_group - self.world_size = pipeline_parallel_size * self.tensor_parallel_size + if worker_use_ray: if self.distributed_executor_backend is None: self.distributed_executor_backend = "ray" @@ -788,6 +788,8 @@ class SchedulerConfig: swapping. However, when the sequence group has multiple sequences (e.g., beam search), recomputation is not currently supported. In such a case, we use swapping instead. + _use_delta: Private API. If used, scheduler sends delta data to + workers instead of an entire data. """ def __init__(self, @@ -799,7 +801,8 @@ def __init__(self, delay_factor: float = 0.0, enable_chunked_prefill: bool = False, embedding_mode: Optional[bool] = False, - preemption_mode: Optional[str] = None) -> None: + preemption_mode: Optional[str] = None, + _use_delta: bool = False) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: @@ -828,6 +831,7 @@ def __init__(self, self.chunked_prefill_enabled = enable_chunked_prefill self.embedding_mode = embedding_mode self.preemption_mode = preemption_mode + self._use_delta = _use_delta self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 3ab8e326e29..5f5aaa9d376 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -6,7 +6,6 @@ from dataclasses import dataclass, field from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union -import vllm.envs as envs from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger @@ -998,9 +997,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. - # When SPMD mode is enabled, we only send delta data except for - # the first request to reduce serialization cost. - if is_first_prefill or not envs.VLLM_USE_RAY_SPMD_WORKER: + if is_first_prefill or not self.scheduler_config._use_delta: seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, is_prompt=is_prompt, @@ -1021,7 +1018,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: prompt_adapter_request=seq_group.prompt_adapter_request, ) else: - # Delta is used only for spmd workers. 
+ # When SPMD mode is enabled, we only send delta data except for + # the first request to reduce serialization cost. seq_data_delta = {} for id, data in seq_data.items(): seq_data_delta[id] = data.get_delta() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2737b50927f..ca99ce80064 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union +import vllm.envs as envs from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ObservabilityConfig, ParallelConfig, @@ -815,6 +816,9 @@ def create_engine_config(self, ) -> EngineConfig: enable_chunked_prefill=self.enable_chunked_prefill, embedding_mode=model_config.embedding_mode, preemption_mode=self.preemption_mode, + _use_delta=( + envs.VLLM_USE_RAY_SPMD_WORKER + and parallel_config.use_ray) ) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, diff --git a/vllm/sequence.py b/vllm/sequence.py index 49a5bc8df46..26e8f4910d3 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -227,6 +227,7 @@ def reset_state_for_recompute(self) -> None: """ self._num_computed_tokens = 0 self._stage = SequenceStage.PREFILL + self._new_appended_tokens = [] def get_num_uncomputed_tokens(self) -> int: """Return the number of prefill tokens that are not computed.""" From eb2cb14981ec4f7ba62adc49fc0ab8cc2f9e3ef6 Mon Sep 17 00:00:00 2001 From: sang Date: Mon, 5 Aug 2024 12:07:00 -0700 Subject: [PATCH 16/36] working --- a.py | 74 -------------------- b.py | 76 --------------------- benchmarks/benchmark_throughput.py | 1 - c.py | 33 --------- example.bin | Bin 76206 -> 0 bytes tests/basic_correctness/test_preemption.py | 2 +- tests/prompts/example.txt | 8 +++ vllm/core/scheduler.py | 3 +- vllm/engine/arg_utils.py | 6 +- vllm/executor/ray_gpu_executor.py | 22 +----- vllm/executor/ray_utils.py | 29 +++----- vllm/inputs/registry.py | 2 +- vllm/lora/request.py | 3 +- vllm/model_executor/models/blip.py | 3 +- vllm/model_executor/models/blip2.py | 3 +- vllm/model_executor/models/chameleon.py | 3 +- vllm/model_executor/models/clip.py | 3 +- vllm/model_executor/models/fuyu.py | 3 +- vllm/model_executor/models/minicpmv.py | 2 +- vllm/model_executor/models/paligemma.py | 3 +- vllm/multimodal/image.py | 2 +- vllm/multimodal/registry.py | 2 +- vllm/pooling_params.py | 1 + vllm/prompt_adapter/request.py | 3 +- vllm/sampling_params.py | 4 +- vllm/sequence.py | 12 ++-- vllm/spec_decode/batch_expansion.py | 3 +- vllm/spec_decode/metrics.py | 2 +- vllm/worker/worker.py | 13 +--- 29 files changed, 59 insertions(+), 262 deletions(-) delete mode 100644 a.py delete mode 100644 b.py delete mode 100644 c.py delete mode 100644 example.bin diff --git a/a.py b/a.py deleted file mode 100644 index bf6f24ac543..00000000000 --- a/a.py +++ /dev/null @@ -1,74 +0,0 @@ -import time -import sys -from array import array -from vllm.sequence import ExecuteModelRequest, SequenceData, SequenceDataDelta, SequenceStage -import msgspec - -with open('example.bin', 'rb') as file: - data = file.read() - - -def dec_hook(type, obj): - # `type` here is the value of the custom type annotation being decoded. 
- if type is array: - deserialized = array('l') - deserialized.frombytes(obj) - return deserialized - - -def enc_hook(obj): - if isinstance(obj, array): - # convert the complex to a tuple of real, imag - return obj.tobytes() - - -class Timer: - - def __init__(self, msg): - self.msg = msg - - def __enter__(self): - self.start = time.time() - return self # This allows access to the instance in the 'as' part of the context manager - - def __exit__(self, exc_type, exc_val, exc_tb): - self.end = time.time() - self.elapsed_us = (self.end - self.start) * 1000 * 1000 - print(f"{self.msg=}. Elapsed time: {self.elapsed_us:.2f} us") - - -# encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) -# decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=dec_hook) - -# with Timer("Serialization"): -# serialized = encoder.encode(data) -# print(f"{sys.getsizeof(data)=}") -# with Timer("Deserialization original"): -# decoder.decode(data) -# with Timer("Deserialization original"): -# data = decoder.decode(data) - -# with Timer("Serialization, big block tables"): -# data = encoder.encode(data) -# with Timer("Deserialization, big block tables"): -# data = decoder.decode(data) - -# for i, metadata in enumerate(data.seq_group_metadata_list): -# for key, value in metadata.block_tables.items(): -# metadata.block_tables[key] = [i] - -# with Timer("Serialization, small block tables"): -# data = encoder.encode(data) -# with Timer("Deserialization, small block tables"): -# data = decoder.decode(data) - -# print(decoder.decode(encoder.encode(data))) - -encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) -decoder = msgspec.msgpack.Decoder(SequenceDataDelta, dec_hook=dec_hook) - -data = SequenceDataDelta([i for i in range(2048)], 0, 0, SequenceStage.DECODE) -with Timer("Serialization, big block tables"): - data = encoder.encode(data) -with Timer("Deserialization, big block tables"): - data = decoder.decode(data) diff --git a/b.py b/b.py deleted file mode 100644 index 39c6cdc3ab1..00000000000 --- a/b.py +++ /dev/null @@ -1,76 +0,0 @@ -import time -from array import array -from vllm.sequence import SequenceData - - -def t(): - l = [i for i in range(256)] - s = time.time() - a = array('l') - a.fromlist(l) - print((time.time() - s) * 1000 * 1000, "us") - - -t() - -import msgspec - - -def dec_hook(type, obj): - # `type` here is the value of the custom type annotation being decoded. - if type is array: - deserialized = array('l') - deserialized.frombytes(obj) - return deserialized - - -def enc_hook(obj): - if isinstance(obj, array): - # convert the complex to a tuple of real, imag - return obj.tobytes() - - -class Timer: - - def __init__(self, msg): - self.msg = msg - - def __enter__(self): - self.start = time.time() - return self # This allows access to the instance in the 'as' part of the context manager - - def __exit__(self, exc_type, exc_val, exc_tb): - self.end = time.time() - self.elapsed_us = (self.end - self.start) * 1000 * 1000 - print(f"{self.msg=}. 
Elapsed time: {self.elapsed_us:.2f} us") - - -encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) -decoder = msgspec.msgpack.Decoder(dec_hook=dec_hook) - -# l = [i for i in range(256)] -# d = {"1": l} - -# with Timer("Serialization array"): -# # a = array('l') -# # a.fromlist(l) -# data = encoder.encode(a) -# with Timer("Deserialization"): -# data = decoder.decode(data) - -l = [i for i in range(64 * 256)] -a = array('I') -a.fromlist(l) -# a = SequenceData(a) - -# with Timer("Serialization sequence data"): -# # a = array('l') -# # a.fromlist(l) -# data = encoder.encode(a) -# with Timer("Deserialization"): -# data = decoder.decode(data) - -with Timer("Serialization array"): - data = encoder.encode(a) -with Timer("Deserialization"): - data = decoder.decode(data) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index d473fa9a16b..a52e67bbbe7 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -106,7 +106,6 @@ def run_vllm( max_num_batched_tokens=max_num_batched_tokens, distributed_executor_backend=distributed_executor_backend, load_format=load_format, - max_num_seqs=32, ) # Add the requests to the engine. diff --git a/c.py b/c.py deleted file mode 100644 index 5dae752a306..00000000000 --- a/c.py +++ /dev/null @@ -1,33 +0,0 @@ -import time -import numpy as np - - -class Timer: - - def __init__(self, msg): - self.msg = msg - - def __enter__(self): - self.start = time.time() - return self # This allows access to the instance in the 'as' part of the context manager - - def __exit__(self, exc_type, exc_val, exc_tb): - self.end = time.time() - self.elapsed_us = (self.end - self.start) * 1000 * 1000 - print(f"{self.msg=}. Elapsed time: {self.elapsed_us:.2f} us") - - -# l = [i for i in range(4096)] -# from array import array -# with Timer("converesion"): -# arr = array("I", l) - -from ray import cloudpickle as pickle -# import pickle - -bytes = b"1" * 65665 -with Timer("bytes pickling"): - data = pickle.dumps(bytes) -with Timer("bytes deser"): - pickle.loads(data) - diff --git a/example.bin b/example.bin deleted file mode 100644 index c2b1a978ec099ec59ae2c4e8add10a4dd7857212..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 76206 zcma%^b#z@t_xBr5ZW1UCg#vAFZbjPS?oQm3Pdqs;7nc?% za+AROo&4TEvsRw2tUWXPjGgc7J+t@BoZF}T5kEZk`Jt;Wx@^_?r>|H%9|eck+d3(OOL#D8ddgwBI$Jc167 znvq!BV~xhzj?x{mw#6DnWg*r!SX)yf&Q^^h((+bdZHcu7#bRuZwHel?Sewx3QzVY+ z*cfXg+EqwJ3$XIB5?FayxwKSc-?R1b>cTVf2H#mIyX?Kq`LDn zd`63Tp*(|G<`fz>Q)uuJpCT5Jrri%jI{)7v5DS76xn1-g zOz%6SjwDq=-z>`CrD+-cUomj$@OY*Vh3EX4!ZTfXs{Xv?GzJQ@4xpvRgg?=dFyS6l zeWft%XOYAXd?~5;iI#M}lwkZwtziFww&&>lp3d)-CA*H!Z|VGo8a?@%&adc9QmuGQ z)!=J>s!Tbbh~%Y2@_!X6cmnHWtdp=##5zHY+=sA^$2tz{Sgd2Pj>b9)OK={6bvV{x zSchV1xDLjWEGT{u)`2QlcoEhCSo>q`hqW)(KH?=lqVXuYPNZwSQduw3u$D&Cbk?)< z{)gVTX?=-QF?}=XtEJ|Bc#-mthetGHNJM7|(F`M@))Y07QG+#^)^D+PreX}%B+47G zCSpyXT*>iRJ7JB(8cUgyV<;U%>vT%SP&`JZ^Y*5GJ&mEF1@kBxL;ENid^+-}cTE46 z&|tEq{Unmup87G=je*Z;$$(9%9Yfpm)R-LEwN#IxN=-;)Z^XI*>w2vJVqJ%IE!H(y zS7Tj;btTpnG(CoOIo4%ZmttLlb+IbQ{{`zJx=x_$XS(m8`xkoFie1=F??LpwMd}Dr zMbu=_S400CYCmM)lHn1|91_9VLNId>!3)(y!Q)sLV4aV39@e>7=U|f=mDcYlYoKf_rDJKG zLCIK(^;fIxd3(_qQ8bpi^|WtGOD!D*VrTtGeJ2fLsUJ(Z;7mtRYdHBNVg#PqTAA#%jfC!D_~8!fM0SF})r@f_B(SkGXs79;&(tf#S_!g>-*daOz%K#yWQf~7}N z`)|*&wV7-p@8rG{=uVB4Q$1!vsf%OvBi&!sUtyA{G^G)m5(|9bE zJ5f1~ik&Eqs2E52PS`t9J`Q^vW#cGvC~cs0994ysjH7rQjWI>zR61`N?b}hio|Zbg zj-%^GQ|&Ce&AgsDS|tA_diSUI4La77`hu#rX`UcnEsrl^;;b<~cdMhJh6^%YhUD^`~58mh&MutKZ= 
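Stepping back from the individual hunks: when the SPMD worker / _use_delta path
is enabled, the scheduler and worker changes in this series implement a
snapshot-plus-delta protocol. The first time a request is scheduled (and again
after preemption or for chunked prefill) the worker receives a full
SequenceGroupMetadata and caches it by request_id; every other step sends a
SequenceGroupMetadataDelta that is applied to the cached snapshot. A condensed
sketch of that pattern with simplified stand-in classes (FullMeta, DeltaMeta and
WorkerCache are illustrative, not the vLLM types):

    from dataclasses import dataclass
    from typing import Dict, List, Union

    @dataclass
    class FullMeta:
        request_id: str
        token_ids: List[int]
        block_tables: Dict[int, List[int]]

    @dataclass
    class DeltaMeta:
        request_id: str
        new_token_ids: List[int]
        block_tables: Dict[int, List[int]]

        def apply_to(self, cached: FullMeta) -> None:
            # Mirrors SequenceGroupMetadata.apply_delta: extend with the newly
            # appended tokens and take the latest block tables.
            cached.token_ids.extend(self.new_token_ids)
            cached.block_tables = self.block_tables

    class WorkerCache:
        """Per-request snapshot cache, keyed by request_id."""

        def __init__(self) -> None:
            self._cache: Dict[str, FullMeta] = {}

        def resolve(self, msg: Union[FullMeta, DeltaMeta]) -> FullMeta:
            if isinstance(msg, FullMeta):
                # A full snapshot resets the cache entry (first schedule,
                # preemption, or chunked prefill).
                self._cache[msg.request_id] = msg
            else:
                msg.apply_to(self._cache[msg.request_id])
            return self._cache[msg.request_id]

    cache = WorkerCache()
    first = cache.resolve(FullMeta("req-0", [1, 2, 3], {0: [0]}))
    later = cache.resolve(DeltaMeta("req-0", [4], {0: [0, 1]}))
    assert later is first and later.token_ids == [1, 2, 3, 4]

On the scheduler side, SequenceData.get_delta() returns only the tokens appended
since the previous call and then clears its _new_appended_tokens list, so each
delta stays small regardless of how long the sequence has grown.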
z%g6G>&ZrW}`G_X7n&+>_DpOb>5iG?j!79co!upbqW9dAS#^Y#Sq33zW(mIP`p|?5q zX4sq3n8eL1U8?%hSJ83~ z^=4YkJy|4i1-<*xx{lPLv~EnLw7QB)Nn^;sg~MZ+J0zC#gk`S5vSvLU>S6|VI(8cN z2&{jx{=pi+>c{F+2eQ7$N@4Yy=dk^d-*o(1u$KDNlSL{XD3$Yhq&sW~e|6_W=^hqXFyNHpgQ%{+sqJ=ZjU9riBRbFgP)&%&OG zJ%i?z)Jhu0VvnIrlCdN94%pjckET?~?Xb6{RryC@Z$pVVGS?fhx5C~MTb6nQjbEwm zygg|jO{ZPI&LRvL#Nur!L_VSP)DlHX9h zlX7KOQDwr63Xz--XnI;!jUeH!+u*r#9{LCo5iuJ7qSiJ}gAo)aTcB9c}| z@9UI@q&`zBGeut|r86-kp2oLH4B~kb_KDahU>}ct9QLu;$6z0geH8YQ z*hgR=u10h3#Xb!CQ0znKP{xDl)N_r-LD&akAAr3-_I}v=V(){!5_<*qa_qfTwD4SP zEzu{?tS6Ib{Q-L-_5|$ll#i!uJf-6)>ZD{m#p7xGnj$3|Xy0B%^On+5PxY@?`#! z=Y8xe6lRoTUygkl_NCaDU|)=V5%z`H7hs={Ej1Km=U|_WZPv^DAF=&@tVXwtrgZ(`AbJ)*fKZC7?pT>R)`$_C4RIczG8c$T1soqVe zay<4l?5Wts{>_qJ$o^GhJ1dob23-cw?DOdUfu3inF#b;7FZ8}j>OZ7DrFkd%%IUwD zexHH!hR3mBC>$3UucpA@STUZ~IclO{SK3We=~)zQO<5mBximCTKY_Xlur+NjQ#*mO zAE}cAFoEg`R4M7xq}K(z0lOZ%P8H<5Cz4TyU5j0VU5#CZt>LM_7XIRwVwYeSV;5n6 ziT#DL=Sw(0$2MwK@SsTU-E^EtrxfCGr3y75k6}NG{RpJ!Q&i)B<~(eE*E?(i5E4vC@h zV+sd#`3~&cRez==?>6jPv2VeafhJ|U3HwIu8?djZVgju{(U48KfSW+6lDkkcf#L}? zenXLfHhx~-9+W5O7zx)XmGv#fxm2#FL618r*0TQ!b}M!Zb~APpO%k|9>{qd0p+hjd zOy{vGmun3A!n0^Jvs>oXR8CTu|2CBqshET_iHeCh`kO%WzBrl@#FT|3&Nv*I^g{3( z8rD#^mY!$m88k?qroNEWfuugB?*dBSrT-%OT}uClwPm9r@iSh`Mh3sKiIh&HbuJ|n zDV|8p2#O}sF3;aA26;>9*aWVn#4G_>>efUPE}~*R^%JRYpl%{;L))v=YJ&S7`#Wqi z{Aq?Hw!xO4tp{IYe}$dIj;U5M!Va+mY@aG6J!}`-!M0T>Te50lcVc&7w_~@_WEO>t zQjwf@Me@~(@zDz;gr8u4jQtVzhu9xrYp!_@`(5mJs1=G|)9^gq-_f(09Htz7(RV(5rSxA&wN{jKhL^&PheXkMF&hs`;oI16VZVv}2KMXNuhDTboyN>al+$<$ z&HGVZNaMGZZ9zo>M}K)Zxi~r0e}|KelZBIslR?L(IO#ZPI3uwC#r_9-0K1=N>2n`; z3a-QM#qPoG#_qzFy)+5iYaLXD#B zHSC|THLElzd)haqvs;T~8^0=l0cD#i%*v=LJYLbp29N?IRj@m zoLzAi;Vi^iK;x-6^Ks_k%*ELSXAaJ6oLM+Cac1C5SE06&muHaA{hh{tNPS4(c}k@pM47B`n}M^2 z$8(b*@iZRICc@LyZ7jzuW8!5kpzU>qnbN!;sGUUFdTJ(7YkH6{TajsBQ$2~QNs4Bd z(X>Wkj`0LDigC`uITJ^CpN_K%=QJE4eF~15;qt%5ISI!su?6>u0(-=Lnp`aSo$HbsUOw2+qMc|G_y3=RlkTaP~Ll?!eg(M-${eI4f~h z;4H`48)q+^WjK4{ETz4KuCJ(QXkNFE`7!HUqHXbz?s9N zx#^H-8lPrUBc0A3I7@IAwId{s7%m4*1Y)cbZka*E&P{;ovC%GH&#W~ zeCl?l?G4zL+MTJ{nbP%QrOf6+DA;fx?U;k({@1jO$$}WX{7m7v~(Dv&G0a>RfOy&TBZY(xDN11?Od)mvCOh zc>!k~j@nv-qdhpyl}ReJCAXG_7pOIoSzu%_>q%-xk$RuLbLcCg|9tvowEhok&Spa* zY5bYZ29Y!~dhYEw&*JE9R^vR4^AygLI8We6VjjbJ6z36~hjAXlc@XCTocmR%@N^ni zQMohDG@Pk8#)!+jf@TTMWGciqIOd%|`OcIH=$$FqnWo(--kGAEsm!NcTMk>(TnE>~ zcFLP=<6MnXic^9kNiV|r66Xt?&v8D(`4s09oR4un!ub&A103Vk<$sOy9*$X!3htp} z71cs8rcS#36+Oo2%*vweFolKk38nvkQ2I6{TD8xkO6$!T!^`64Ln3HAn9cvcX5WTm z)U9xpNRDJiV|@?K-8gsQ+=(Oo5x~lHE6y!AH{;wy2ac-blx+bGY$^WW!rUdOe zoNIBep?tEb?sUo~Q#zTF$uupdcrrzkX&-dyPNZXV<;dHO=6cHeXqZgBDUy|>ZZfr< zw3(G5a{;xJshLc*c(s)NNXdGtlx(0{o~bO?b+rDB(})9315Q0o9gUOeil}>@W`~}~ z!FcPLw}=#u7s;4F>OK0-rteF`|C{~}6Ye)WmRk&orSWFA7?j3ZoEn^JoGP43oC=(B zoHEm-X({JTI!;%ZsG{+7n*W2Vbz=&pQz)52(-MlOP&9?A1m#;OSC(0b^LC{y7jB?o z3iU=rvtpc|shdKZ^z|phm`6!3&W|{ni>KgxkMkYQw>Ywwl$3mZg_FdQy`*G>6XFCa zmzESsY=I-oNe?_67stV|DPE6b;dD|n8mEJ*)zmzzyy>SJaxy)S(OysQvvfXBY9CVX z(svepU(kOp%~KdSeRveN916vS#+TW05XE+?G>~mLtu$$VX`xQ*VzXM$mS#06%rM?Y z&Kn~6dO5So7u=2WHjXfP6G!`cXW*D6D)&~C=OP-7`&W1xl~Zx2QaJ@zf0JoG7&Md}%>KpshdSXrK(o5qkJE=EgGwr) zvFO3+#_7V5K{XZUFPuNAlwOC_u7UgMd6b%$XkSE|4!Bm5dWXIo&t9 zxYdvd8c$}cK?MK6(QcR8{tf3>oL_KcvnjcrHnR|CMKIcgbE%z5%~YzVQZ-eKv{)pu zIaN&*t*7ZlTGvzAAYOK{NP444#^)kA2HbojK?QfwVN9s>bt1X9(0C^9eB61s#$G5q z6?Ye0k8Mtzwr_p?fo@8EX$R#2>`pDi6l0o%!GM`v~EzCpKgc>pND%c z?m4(;)sZ zDAkr9;~t57grWs^(xEpw9QQEXLvatmm4YhkLAVFv9)P<)?tZxY;_idH5_bjea@@Uf z_rhIfx_UG2p14bK_rP6(yBK$O+}&_@#a%?B>RyPuKpFE+r)eqG(r39LThZJ|MIUYN z!VXkVqiUM6q?utz$dVaNlQCm5$KYN@tC>(TKE-_t_f6b4a0fLZqC=zbnx5rMr}|mC 
zV`}7cKP*Q2@$^2eu;3h8t|j#*MTJ!SLI2s5X#GBIcpOI!iKFp!MhQn#^BuUa;=Y3W zGVV*bFXFy{yAF3P?i$?Zai7C|7WWz4)woaNK85=v?i09=<36Sea&N+Y6!#I_hjAam zeGvBn-1}+NJKcwSFYZ0KcjMlLdnc}1yB+s7l`E7`Nrw-mVjATJv&@Srn?^}MspvFn z^2N(Ii{fb%O{0AhB{G=I{GGQDZl+-x_0wp3kGg5FBeg2v(EKSCnp7>Sh2KL|Nvofv zeRtE^;5^hwTLHzgf6k)+OVi|U!y~xukO&&zX50TmFl!Dq)2P!(zf9{dxMr6r9o$QC zFTuSS_afX2aWB95a=NN2yl zC`^o_?^F8Er2h+w{|8%qyCHEjUd?t!qdT^yT(*iav+@?upgv8f?S1N|Q`$?d>3&u~ z&2(xysh&=iVPr#1EhRtF^a`!N;@08Tik&|Kw;Hz!w-UDkw;Z<&w-mPow-~nw_enIB;8bRMD6py6; za|TWwUJ^$SiJW77hW#3s1yxNW$yp(Ly=)Q`k%#%;oF#FcC};MSXr|I&Jg!i0# z#wdGsr$|~a+)It_R6IbhF|)E)(fb>1$0(XNlD_@u`oQyz^+iQ>l!PL~?XWalXQQQ>MU7Cb>EoH=LZyY1TUO zEH!JWTq{OaF1^OB%GGl7tHL}TBtAgLL-c(_{~7du#=yzLBf7(oh#LQ9hyRn$LLEdM zLD?+R!bx~L;_ZO9J)SHz!^_ziPdi^TDQm;qR*X#ZW;q+-jl$C&n51HByshxI#M^>0 z@HVGd%+2sN#oL5ZjgOKeX_siJE)^`KTty1-^6?UQd3d>aIe6K4S$LUv8F=Y@+hpiH&Lfozm5m3(4o{8B5Ro^p2z9 z6&lQHo-p0d&{5646unEmN3quIlZMA}$02ZBFw=NAI~vLK&cr)|*1Pad$6JMW8s4dR zr{JB8Csd_b#!Ab15$|}@{)c$S;T?;24BpXrN8ueQcJ5PnN8m~O4#PVX?-0C$@l^Ri zdYJFw9f)^;q6N3$?T@z~-oAMI;H|`4fwx>w5}kN^#c$ zG+uzWE8Zg0+JEsD;w`|Nk2eo*F5WJ9bMR*4nUT+!OZWD)ms7Ep(jBPKYV!oWznJ#V zq~&`0_Mz`X`cJ3-Q;PnFwPwta2pSJ(j6tw+7R?5P!sDsX{lG!lJBx~~X`f1S8{AIY z$6yp9?H8Kfpw+Btl{2>_YB@@ zyr=Pm_LF!|;60A_7~Z3JkKjFw_YmHLG+v1J0N(w0_nCHT@$OZadmY|Acz5Gzr0>ML z1Mhac+wgA1)1b&A5O_D?-AJ>>^ae^+(>*An2huZ!p8Mz>OYh_KZlHY+Q{8&{R?_zY z#f9{L!oZ2c%i-7|Q8Yfy*g-ke&dqGqTzH&F=J}N9nn*zLEGo?gMBc#^&7#EE8wGRV z4jKm4elGR1Xw&IiFLkr1HN3=FYBb5|&rvFC2350YdXv^al}d9>1}U=fB@+F#3X%8l z-o<-IjEoONa$dlD8}BVVZIB4bH^fVSAMbU%*YIA&dj;=hyqEB#5z4!c#*36C|5BQz z4p-w{g?A;M%%nM#3DV2)F2gfkWp)GI+fgme-jSYrsn!APV-$O&p3=j@I{H@7r!DeT z^nc923B%(#Zb&?hPcv>1&r2wsL+NZv=1^jQ%_$VgJdffz6wjt|Yl`MjG@JHm6hnoK zr8{Ym4rz5!xBtNV9q%`l$dhXQO1*KLvzAe>XIf`I#oK_Vbw&lW12h}PQ#+fQ*;JdR zvK*=u?WD4UhLKd)Qv4%c6<#G?1ztH`8D1$~37+xg6Mc9^wBC#NCEgc!pW}Uor#3#p z`xx&dybmdsJeo;7e;0aoph_#u74Pto)P=>Ld;%{B8Ysb^M zp+vJ4uZ7mX@S5?O@EY-qVx*Z~B?|Ex@apmE@TB`Sc-4lTG8v3LTewmr&x~w_K`Wz! z=Fe%AmA{r+S(Q6d|BBcJdysm9&X?%hn?{`|o<`A!G}a7{>G&ZrHU7ai^6Yyo= zDY+BASy>B@!5@pSy-e|T6eIIok(>hB9olConqjmcJ(nt7M;MLQg(tg8BK{ZkW}72# z4sD;Ca5iLZ{`vUl;h&3t4*uErXW^fTe}*0wJdb}mt%K`BFa9dA@&;(RTVck#A~|bB@-2}9 z+1zIG%DtLqGcgw)jlTo__V}amWnBs7ZShCpZ-cM>FeSIb-x6QDQ;KhnzZs=%DwmUw zuNyor{wB1~q(~1(;%|(<5q=@QI;#u`{5UuU90)6c=r#?QjfRQ(z1O$L5Cej5G=ynpfjp}B)% z$;$vNl;|^oi~V;_7O(Y+zI#4pcQ^DMVnFc9p$~sku{b2ZE1R!vahHx6JFlN zBAG__(~Q6-BuA&2um}HC{8MPT7yo4ZlkiW(KLP)E{NwO7nP{evFsp+{;y;D|qzdJC z;6H(HgskAd_>bW~ivI{*uhG2?6_Unn>A8!tbb22owIiuVO~%t{xrx5t=|7on-=x-Bm58XKfr$< z|2_P7@g@9k)AcId+Ld{pvbD6=(5s8buaMGy`5Uw^Go@Ynexv^+YIK9@sNqqZJQRwv zj9)W(5XHCf-^70d|8@M=@L$D$1^;C=l<_wHOZYG1zkt6Ee=YtRd@1U4_|M`$gTEU8 zX&Q9^dm`;xm$y)%>>QDTC3I@u`y2l+d>KmfX%M`B;Qx*722Y~zSBl=J(#HuzY1TnD}z~&E~jiBr6zCT5tQmQaXuyUD4tL8JlX?_bg6zm z?XxK|=wzNHl9NYG9<};zZhMNr@5b+d7XF~)8JZ-Atzu;9AogXNN7B40J=@T8 z3BB9Wdq1h|DY0oVoi4nDzF+7+(d6!6;E3UIoH`_q#;ch+h+{8)3r)HB&G=3Djri(h z1AaYz9eyo-4SuznOViX-r7(LNkwhMTC4L2dIV}$u#@`|tZ;Ir%(Rh`@f@=vD5X>i- zM=+OQ7lJtivk7Jq%p{mWAY1cf0$B#iHkDus!DN-oy_`S>lz5W}CK5~_kO8ISP6Xo! 
z#uCVOQgTOv9hANBaN0xKjct~Brbv!HYFq^RQtl-hcA!+J+MCm(t#++2_mLV+>S5{) zk{S2Wpq=p(s2|0^;ltxNZAcuAH#1E*s=~Z|O|5w}#0t~3!@L@oI(nMzjj9q4^Yrxt9@aPoNzrLAV{kwgjUH%#aoy zMu~V^6KqAWCBYU1+HP7vdql7qT`$tTm2%}wre|wyAl)-Xayd$+HwdI1bGCx z1d3~md;>uiK_;C$5M&Uf6QmK0!2cKjAN&FQeswlO^4^D^!k75>(6E5gji_HheN3C2 z&8gJsCdC4(zoSM}`}11SKcuyfCbf1p!C3@n5}ZMBI>9P}(+EzbnRyQ|6dFoHau=KhQ!VIFEa*l z+k?hy2$m2mCfHr|=eH2-MzE`C>S}^T1Pf`FUAmBph2rI2O8G(pqp^uE2(Bc!fPVq7yAh?6zc7odo zZlziBb_;>F#59^W5!^^{1A$B_CI3rs9l^CKlzWLt;%t$^LqyVb{o18nsOQnOmX2$w zZlz~SdM==MYkKb?HA=a%Po~Hiy@}66@?WO^So;5<{O93uoHZnl#)FylfAeGJ>8dhE zpUM47*=Wi)QYs^xHtk|hqt?usdDCe6nA#><2WZL{BWohTX9Sj>5otRZ-w;5h=b zZJD8r&k(F8c$(lTf+q={Ab6bMF&YKrqso%5JC3VpPtvkGby~;G2~Ebaw3X4dh8DBN zXJ{)hostZCx1zR>W?7DJP^)_YkLh9dtMng3|L+VOGCYd2heXl%F|&;{GMzdN@{a^R z5Xd&uWcVGm;(SZ+joQl5QM!0?`o;T-AW1NYtS+sHt(8aYkRTw?%Cm^3Pw04_)_-U! zpt6&oh~P^CO#`2+u0kWhiQh$XpBKq^)g&E}{3ekCSxu)?xhoZ_Sk}_6l&fyD-sE0v zGM+`5ZeuN?bXQ6jQL-x~izwce;zd-}(f$=h;IsacCSHW+pJ4b89Sb{z(w7{eY zdI@?6x(T`n2CW(`Cx6kP*OWo6ca)Um z5hZ`9jm((@H3ZcJRRom;6$Ir3Wdx-JB?QH^JWBgDH2#-xA>jhT`Gm%T%x@%|OSp@o z1y>NxA)HM(i*P2*XAsUHoK7eMYBxHrqWcKt%4nfybDFoLSATbrGL}N_Q8a%?pRvz# zpQZmO+8<}&Kf@!q%TNf;HvY>lMi#@Vgi{D76UtoMjc^j-M8XM#GS`&ciEtd@SVE&N zxff9;o>>eMXA=M_IVXd9BNbVLFO^A~4kKq8{^0E% z-TM>B9)U~X5ZDA3K_?y3><&u)CTOSiUxGFQ9q1Mkv=C^5Z6atSfS`e(o}kWDKY^f@ zmd6xL7+EM>On9imj91J<`Gbek`UE|j(Q_WXTcEAc)K;YKht<@-qbxbA=|7VGUl}-P zcs%D0iKp>s=Kh~VW*Fx${}w_)wJYHwlQR3J1vQf|r&GeuC7~wU9S5v+_;Z=lJ5=vQRL@9Y0 z;iZITDbBrsaBylool-rxi10!}@nlXZc^=`pgy#^RO|g<^5lT4O``a}~=m zmR0UZN_1{0>qomz+M(Ez)V&nx0^dV)zE1z)G<-;vH#~~-heXkMH1iFLb$Re7S%SflKlKS0Y7InK*yE`?zQ*EB)1R{BpsoI?)UE*6uTN+Jf zBP>f2^%j;g3lK6%2V1?PQ5NdWgi10wAa-ShQfN+1p{RsCZ+=p-_ z;R?d#gnJY2MYxP`Pi4$_iEt^QBzg&<)Y$YQzkz0B02OEyZX+~z0tz%5w-DZ}X!^T! z8)YvTLC+@ioI{D$oI6NuL9@0xw0b{C+bMKBPX7iPMh%bWf+6uV9?b%SXQeu&8%-MA z8wqb9l)Wa7E;ETE2(G1kF=dM>U92wUo-dMEMTu^KD^n5SmsHfzvXpumAo>8*$Z@7| z=(3L%$r(Y7nJqI8HzW;o(iIbaqEz0_bi6>*=d`8MG*XN#eN6u(;fIPQMi72L_&(u# zgzpl*L-;n~TZC^CzCrjp?f)fwjqp`MjmgV|g7`&KZi7gnW~dhk*AcE&D)$Mx525=d zdPdT7HocqDdpoJkseOa$x2Q97Wnm?qZ_&S=fdhxfa^a9z8ZT#|!7^M!_&nirgwGO6 zLRJ$#P52a{-b`{OrIt!Qszx(jBz%PMVZw(9A0&K$@P5Mk2=67lhwyH~y9n@ zie_by+Kkja^i8Ml0Wk`Aj5-tu=SkH;umgl`-QU z+V`XXCkFN(9>HCQL~yYi*v5sgF z(L$mHMDvN}5zQsqg=h}ZY$DyyIG1Q9(F~&LMAK9vzm`f>G?i!y(PSchSR+j8358-U zVGX59RufhcRuWdIMDE~jf|R+Ok|jzfw62vAmJ*gwtjxs}EuqAx*32F0W1x#p<6mc< zB$9okA^%V>Azwn*jdUMG_d1Gouk=iMH>USiQk%k4^i8EttM)#0>W8e-R4A)`cno(N z5<}zF>}D`5l_kA|D%lIB-|7ESr3+UQD4QsYD3d6ID4q5jiPDHh(0m@z1fuao`fNc!k0TmO zq^(88cO=?@XnP`UlPS3!(Y8dRh_)fp5u!L|Ga~mKk%IS#wp5rng-8~c9&Jvf-Lf0# zK9FvKw-G&OP^>ff1*9Y$t@KTy&zNy}B{XlKUT3ZU!4~d`gWp+3CMVk`Iz*FlZ ziOeFKcRVe7DNHv8cgFctjHakf(QIQ)XAjV@2X$(E58A3>E_HjDy#G+U2bEpa>_N3Y z-!Qx!m!@LMf2A#(%I|2}OuS6v0%nb&W@}oWCen+aOmq^_i9{z59Zz%|(Xm9w5FJf) z6w#5i-$Zl-(cwgg5gkf&2+_er>h?iI2NE4Xv_H{)MEk1ZjI}hIwJ*PhXcAr5(|rKl zYv?JY=X82EqFVOONK$uEok6pii3(1p_5=EVWMJRn<#6#(ILRNg%F%bB!g@z zk>o=h5VcRrKo45HI15^ojJX+*|K%2_3nwKa4rOy7adofT%D zD3Wa!=WJQYTEv&qRs-{>TT1N)YL}Y41Jo>~qKoRKR2fE&L&qx=XP8oE{>+;|)8>5n zkD^&S(z2TNZ;0L}k_f7U?-0FB^cK;ZL^`j!ndo(**NFBZT1m8mXgOW_K5_&d*VCcz zHg%V@kltHJZA|J;s$}ilL;ZX7-$VZo4D2&JmP>}j(s(mV24!+@qP>Wg5$#E2UMFLX zNdDDCH&Q9YZXmkeFsg}U0*P@Q(X~X^5Xp>I@+zV$iLM}$8Li}HM3)j>LZrP&B`+en zkmv#;<3T1QeV-70Op%)Ui0DJ252(|At_)cHMB!x0jjNY=f=ITp-Lv)i+@91MqcKYp zxX!dTD4O0+c^4h8(o{mVwt;gf*1D|?;8v4w2U?z?{aZ!zT)OvHnAAjX$%8!Lt1pqF0GtA$pnUC88IJULaaW zw3cWM(ep&l5j{)v4AE*LDdkf{PZB*r^f=LDM2`|ZLi8}vLqrb}JwS9n(S1bs65V6^ zsS55Ux{K&eqC037thW>0MszEYTGlqBaJ`8N16Y0)<$F@LC#B+*5tR~^5EavUASL1y z5q(MY1<~geEBTp9=m^xaLfcE9g354Fp)e0c2k91=U@ 
zwd^VER8M{-Q3K`bMm?Xt3+N5&zHA|ktgno=i8<1PlpmqFv;tb++;xytBME}w*G5m*UfT*9S zk0_-Q8P5{y?C>JuX~a{Brw~sj)_2Bx5lFch%=6OEb$o16*tRaeg*Lk zM0Ld5({(M~`-oRKQzSPKCD&3IG<7hCM zvW?rE-AlbmWTil-D$A%{MvY;l8?5rDQN4_+Wt99w$7{4DXetvsM?Xxr6{TI2Y5Q0U z`E#a%?TNFAvxqZkznxfkEhFBOcq#E7#7l@56YoyE8}Y8hi-;EzFCd;zJdb!TUDwdP zl5QiGS!+cyrqP>6?+v61s2QMd6Z-CiHT2&>+nEgPJv^3s4T+`kUG_3q#=8*DA)ZY< zi+ConewFKD;;o3cr1fCpEr>U#L@%)!@utL^P^|b!;*E(nA}*vz$pYeh;)HseYYrz0 z-XtCrNtsV8iF1i_#LeEEPGga$Z%2bMsxptI#!O_{2UBm-**%nPM%`Z2?nRAZq?si; zy_af}$k)fBdr>M0FQ?;mn#_i5j=tvaqEr)(z9BS5Ox9@PV~CF?K8pBA;v?v~n(h^J z->8R$HXYy6YbLAoV@c(cx|Pg7tM>-Rs6(&cNzaXB4t(zFd#x{9Nlj=IYA6UE<|BHPk_7x7cXPZB>t{5bJr z#E%j`Li{lCL&OgfKR|px@qNVi65m67H?fM`snQ9bSh}HkQR~8GVq`o`<#H;PQ@)%s zDMA%-rRhQ`aRqTXrQ($lm(nVIDj_bW>k8A0Lb{Dko+eTtu^e3L^(&eBQps4-1$)vU;4`UTjxPI18;=&r8W71oh&nSM z*GboiIaX>6t@MylA*d=y|ZDLFHXGx+ui93j8#H}E1BR0N6 zh9s_qSeBf4O~j4F2DC&2vCJy*>WS-!Yl&qUC|PZ~T0-dxT8+n;d#XsmYo^38RE?%> zo}%fasFyOxO=(mr^C*$*0}c6`s=v+iE~+-r@itA{DwWetyMEtw9m#Z(X(UGw|0{OE zOT_;W4-oeg_YrFc@^TuUqx)J~ja8J}O|L%K7p6I+Zlte}+I0G7P;FM60yE}0j}DLM zzC$8ve4Bj-5$z@JAvWG}-pj;Y#D5e2Mf@l6AH=^C|E4GDpVE8<$w^eMBsr1f1d`*$ z&UlgvF?LckUysL;97}QxN&SkN*gKHio}_+WQ}OLc8X)rJs2oAb3W`@yw1T8r-V^VU z%qE#dGLxj4`x8e~^fhg|I%!t<^lfO+x$g>^lC(8Zzk<3IVrL#Hl6`<7e^I-FniW+1 zL-h)(%+qu;2KhVFbs62u=)T5eJ4YnzIeJYKIb%s>Q>9g>fIg$`*}7(N3!U@n{2y*k z?KdQ%#@pG?AlfmgyE9BPZM3{VoiRBIU!dLidU-FA6dro_gydq9yOZ3FZ%xBW>i?wcQo8q~TXvCQ7k1FA|D@n2Qdy+Tnx3uC4O^*}t$s6gAJKNs@RGUz zkT@F8XMclZlYZlBCFL^hR;vC?{fw9iv-dYd67e;a|4_4%>XlTjq~l$&^CyX<|3%Yw z3UgX%(R{aolRS*1mY%Cf-b(Tok~fpQiR6tWZy8@eu`dnw&lQT&yj6i%Txjn*}!GO5yUwdd2Pix$)AzlnM?8ihr} zV|l=kSQ@|Q0E1=gQ6y#W835DoC%F|#vum1PL~;v~GSejkI?LRI25rskL(|vP??c=F zW9+P>~BZz!TLhs^7;i%l# zTrxAsoI_>?ndxMvkvW@;iqEv+!n5c&kftFy6B|*dZ5eIUYMlRwmd$BX=GeU`DdSt7un8`5FT{ z4{|+&g$&-J;M8PlR#5+moW<*pT_Y!rilLEpCK(JfwNxl0HDs#ERFM&3lh??`g$gJq zQ%0tgGKHy<3b!v5lm3E?NR+}pC-WJZPsxZx$@>YJkIAec^AQ;hCm)b`pUit?-X-%6 znYYQjMaMzr)n@)ZN9SIoo2-ErhmLj{4KhXAS#}cjtEo4>(=36q~kolU-S7cU_=_b=fMl4EQVh5RaGHql;pyX{K zV<=Cq-UJyDD1|kezwjs(CsQuf>eDDYnNrhDq_t9bGFBR^5UT(yg_Vz$hn0(!#LB@+ zm^@CS$#B;~)4e1%q{XF5ozY*Ec=R4d?-leNp_rmYblpzBv^>4cz%m9?4Bo`h&I~PQ zcwf2?92>#OqatV|p2Ory8N87ijo#NY zypLiEJ{ud)(?-K{UnBdRHc})1rb7z6ja*oGI!!0j;>nqis`?HZPo`O`_$O0;GIb*( ziU#r#`xk0gQ*$z1x;J18nmVYn^;Fu>&BU*?70AwSfiGy2u>KpJ4`LZcQ2acWs@h3d zCt{s|bv)K_SjS==gLO34QCLSRRly5bM_?U}br{y6SchO8jCBy!fmjF7Jr`?#to^X| z#o7mJZ>+s&U52$M)*e{9W9_DQb1~&YX*A9LezdG#^gGDb$}r-6_i8rEbi<#7_$L@dL~i=UHG@E@#u>7Ivm z57ymScVXR$bqCh%v|f*O8`iB@w@~+s`NC`&h0EyEc8+!B%zc*rm#F-KfomDeW$*@u zc4TN7b)QiF_1Jau^ik0?QqSolXx@x<6V{DbH()Kt5_>xpOGSMh*0oq7Zt`A@brsf? 
zSenm~cPZ8qld%V|7Gr7HSJ)z~g$m1g7;9tM>4jLLSt^W;umsEol*zq5)_Pd$Vy%NE z?yZouu+}umKZ7;s&@Q?4Xz^)2g0BBkVFroxHZl?_44Z}yIa3p$M0P&3X?1ZmoeyC> zi}eiF(^yYoJxTM;|&X6k23Wimwe;XUYGe$+w zNI7SWAov8<<5-VjJ&I+>Sb@#@8Ca)dO@VRLuS2b=_=Ts@bSm{pnzi_A>Ssb?g#u2c?o?_`K%SX4N$p7Q z!SpVcGuQMq$<1UGchLU={ZdtYHRZq1q~)Cn3|&Wyu7OI7jpLc4;%KCtGYyV4r_!{F zu1~36P0LoarIEIUws&Zfb`Zh&D%LAlFJrxg^&*z~FSRKlZ6GavmhQ`FT~37>T{Tt} zmX4%XXPZ(+TO^#<1KSg$Ez-iCCT`c^PPPbX9HEWHQOdpUikkIcJR zM)8jH|A+ozsx;0>yir@zICCw-ds6f1*!Z0_Dt<tw1N7G z)J>#zA~h3D7LGN>Pcm|UrDbcXR?C^*nkKcRBI-M+Hsee3TiV{G^AD9;`pH{=iy|^*fef0maYAD3DUo1+?Bk^+c)!U^5n2O;}P8D{li$JV4nbN+)5jg}o+rF}6BxdDGasOR0+91L<8vgEp%fikQBfev$Ga z2CiZ-!Qe86wrA)XhWB82`Pe8<9Ti0*(M&Zc+6CAt?0oDz^Q}*?bFq`yIoJv8aoB5M z{f+e()-cu()*xu>>0wHnZ`x#`%3%^Mx_WC8%||I9u`XSo$<8~*7*fuu3y}=)iJz$Y zi8jqTh(zg#eg{=Hora3!H_@q{#IUMjb!op~{fzY!Ru(J9im*bg0L#bnuw11oQ0-TD z>tNYf7FGu9N30*PzNc0F`YNpN==hu71L$2ypNNQ&=?kx*>n{2S8Mu-`ZKahm(`JUQ zW_WjouOA!5vqweI$TVk<^oieMeS`Hi)>l|7vAVIkutcLLVRc}&W3^$mVzp2;iAoi< zR%E34OO8#k#o5lo-Wc0R6S?hUSWX)iOC!%r8>x}UV4MF@{1o<4*hgX?fqgjkVc3UaAA)@__Ca(n!aflD0F#?% zu=mH_4|`wieXyl$$?XkDR)`GP{ zw!z*STMO25Z;8DH_IT{gX+K{d$zM)~5k}HlNsMSdhFbOD|E6S9n!ZtRUPR@{@VzB% zYssI8DPN5gg|&S^eLM9UpD&~DKq|ISWYGc{#V69=PpctX$(0OVOO?*VUd6B!&i;eG zar&qj8i{7QFw{Hq+GXT7P}4z$8roxYUygk*_B{$NcpCd|?7Oh<#MZWxo3U@lz76|U z>|3zq6@E8j-=MUGKVUD%R@Xfldl~k1*w_Ep$dVv8opyA*o~_F`-?BzYHM zFT`Fz`vurq?^4Ko?0MK5Vb9fzbCmYcKoCf7=nrX*vNnTa%O=+_2)-3VlIm(bFsrhWg9mhEV2p>1uGp?zhfwxe-F*~L#{ zKZvc{`UK1U*!R(Ti-Oa4QGFVA8+I$U`UJU~vB7S_R?C#P0lS{|S>}BY%gDWpG68rA z_Qlw9ur-S$?}ga2v1eh6Cdqp~w)&BCv1ekRgFOR#I`%Z|v$3aQpGA+EV=3B)-g%U5 zL7#50+JH7)iq^-#GzK&|aWxI1I9D*d3&Yorjo>+>B5356bN*Kc(;s1DW`~ zI0{>TZ8A-hX-HDuO7n3vPFAK0Po-fp^~RZ4i@M3wYQ||YU0=|$y~6UeXKWRX(mAOs zAPqZ9n^u&NKFkWrG$Cw|&eyKp4$5up7qMT!{tx!^*w0}>}aHD29`O5+La$FU#7 zepC+?%l8QO!`Kg*7k)vv;rRtm(W;ivPP>sz5_(#kNg4kP`%~=yQm*}I74*)fZ-4q6 zs!e8cpOTTIIZ$l}65n~BRx!n;3`^7aHDe<=b5taa%rbMNNBo2meg9+Z71+Y`L+lT* z-^YFr`(5mJu;0dh3;RvwA+5+auwTbEBrUDzSFvBgei{2EnogrddM8AgO#3N3g)+_b zo<{v?)SX7DwhWy{&1rOfNy`LkOzX-!(iovhJb|`#WT(f|2+~kl^Btu+r1B$G9h3?7 z9_&A{|G-|2{X6z=*uP@`g8ei0PuN-P7(2oau>*R_&71b4_fqO#qR%!TnIofERYE-H zYzBrHyo!>wsa!#gMwP3_#_-%xF*H)kx&Nz;#gCiMyi4~|*{PRky-jw3sze=jEp`nR z!nm4twW=y?%}UB!fnAPWhAkc~uMxy@)n|T*{RQ^_;f%vs1N(36zp%|rR$ApUM90zC zgV+Pu{n&ljy);cxWL^=?$J0242IDK#R9P$ans_q4gf`etp=JtQ<{Js^Xy3s+VMd(1 zfErT*a^}}jqB7{9)TXD5=3VK%gjO>iAHtXi-AD~ zucUoFhL$j_nf5=jXJqCg|aC) z>*1`6vks2wYjf`~#_g0UVlA9Caf)%orBA0sA!(dKoC2H_PCiZ^PA*OoCkH2ib2^T? 
zB*RkE@8g_Ai;)y^=F(G2T}bc6^zBEVMTw>0f<_rBO$+|SzyO0+FjPeAjtuK})z~%j z{84cCMHoWG4a{`9{Df zdIRS~oD*=4$2kt?Se#>Uj>b6($F$y}KX8t~IUMIOoI`OAq5DdlgK-X0Wbvao2jU!n zvp>#$IQ!!4gR?izUO0Q=?18hpQWZRovm4H?IJ@BNjI$GtD3k!)0cQfv_Bfh{l6PC2 zZE&{6*-8&3YjC!trvw(!JBL2$@PCCqBSRG~rtKPtR=XRXiaBju91?Og*n-pL0 z7|xA2H{dMC5oL!UGdpJ6if1YOdn(q}T$&ybmxOHN{V0>f8~jo++M@iTJFEQ6mTqNYsD z$C-yS7e})*BcHsD4pptwa3_t;Ts6M<*`@=z};9;M|XMpJIw1!MPXb9-O;z?!wV})0sFmls&7+aX-;} zA$|MM`W=0yc@@nwk2Evz90R>nzE8=Al&_%L^lhnsaI#_csE8VAX0}1J{SusNoGKiR zD9VQ6U`hEDta4W66qOU=7vjvunS~=NC-3=6m8*Jn9?rQqGjYyQK)$NY44mmW({Rql znTm53j^23&9mmsj294SnbOsHk#0tbvja_&Gb&B|vTH{NkX=|gaoSHLe*;xTO+DZ8h z4XbF|fJVV@I+T=|zfPM?8wpc3`PHE1MnU}?jzKWB0_SO*5yG>NNw71$BSh&u`gG3t zXY=5_GK#ll;8_NG7+k`TVQopNS=nB8>K~kOxNuZdjeK+ANIiWL=LsD3OJ>L^`rR02 zW~Nx#?8fPmvq1Hv6GziLLZ}_54W|`HWLw^59B`U&M7HH^z^T_0gN8V{EA~5O43XQq&~v=8RsXQEKZCQ;e(T1m|O%6?!`N z793^aLpn~x>B0FE=MS9KbnQdaSu|`x?^^U;K%H3S5A^9soeFF!1JBU)AcL1Pluwz) z8tp#)2mEKwsQ4NAW{#qNl_gLIxaF514N-;^(X{0`++iYhlQgM7Q zyZAwzRXE?_e2eo9&eu4qzbmOWUoOykzp>LdQ+XB@XVE^F^0RO^!rc&e16&E*@~(%w zF77(GTAYw~E!;J6i*bwS`9FHsr2RPh_NMQ9`m%aia%8o;o`I(s_>;<9nk%R^a&hVu zT6Y_}K3+U3o<_>K*x*^B9HemzaSL!$xCWlwn{o4Sb8(ZnIk*Yjaky*XXoc}4oWF2} zafWaPaRzYuar$t2soj{C->K3#t^=7HQaz3~ll8<&P*3X@a;D9iO+shQM}V({d^$8m zavko0iYWR8_W<1eareXB7k3}ry>a(aOz{J_d*beayF2b~xVz%+g1a;BPPn2(Q*n2| zoq%h)xq^r2k>L0o&41D-9l!PT8L;w9Us@@Ut=cj@o@2X*q2Q86@9 z&LsvzS42#i*cNvi+^uo9qWK0Y6|yDn7P#YaH^!hoS zmfg&Ybg1!H%1rN{cd#*R)WfuX2`L&>`wT%%mdi+efO`b);kbvz;HusU zgBx*gz+H|jB6l{f@}(MbE$%hAS1S(%4^pMzt8lNxy@F<)29kFv?h@R^xR=vDp9+O6 z!d-~F09Wiv-ubu!e=e@rmAseeO}RJHAtX=4JpuQ4y7r@KDoq;Br_!K3^;4;vO06lG z!eeObqN|#kskH1)y%xJ)u@(5E?+S83D^ zhQsL4d4}H^T*y$;JaY;|;z$1gy}6^JXJnbVLeIQPTf;^gRRt8#@i^|2xKH59@fhx- zN|7v+k$NBZ5!{DyMc}T-eGvBn-1~9w!@U>x9^AW?s_19jyKwKsy#x1l+}m((RdBKH z{JDvCofe&j`z7udxEJHj!Tmp6(J2LL4N=I>#+`*LMkV+8xaZ-Xi#rqd9NZbW({YvZ zY`rh{20BiqOZ!gGrbXJG&!*vQ>d&U`Y5{h{oX|LnFh?Gc~Z*WU26I+N4&eh8pcD)6SSZs8Km@Vm^Ej^{Xf! 
zG9NyHRx@3Y-pF_?YQ*4-IFMgXhai87?&b2O-jk6ml~ME)?l-t!<9>y^QUS$6yBoI) zw-Z-PNqDs5w&AwoiYdw4j0v8L7SEUxcR!9wQHEtDdC2j@f z3Mt1e!!5FDOK9Bnx?z40qfcp&Y(=<(^AnFs!1ZwrIZB%HO1+Eg;ySoCu7#Vy)ssKqevi9K@kM(4JKS&OEWVfO=~PXp`4%dt zE1*F8%rqUS^OCw@$qY$}bLrogexGvfXnThNliA#37#hdW3L0I8|3Rz1Y*ZYL40D;m zv3(&G)A2N1e2)7W?x(ooWh&xNa6iUff%_5ehq&s))vVvc)&F}3_ify_OlGulG70xh z+&6Gv$9)a=Roqu_U#9CoT6A7PBKAtkH&xogqiETaw#@`8RU@z;pwPS@X*KmXaUA7l zmL{QMD57IT0vYLR%4pIdi5u|_#XAJ=V7!BrEcvC3)H`^l=!(qPTYL}ArUNOskM>1) zo8WCsiPo(PX;$~R8-1n&%}vs8iYlc^@sNREXkUjRDT$PR_9{yM0h$X&MbpSG3k;f` z_K671jqo<=Uzj6OE|3Dq;FzyiUpnUn4;|}2VeCIwzIC^Jl=u$|HwlqgI; zVLc7+Y`m#>XW^Ym$7H-S@J`2@f_EC8S)<5Zh9|%$;Z4Ll74H{4zGLUNkChMow8I+{`~bl9s(`)6`)Lt%hyo?QaYtk|mC%(zLVm2F4@7Plxx- zTuV-ijQmo%Z^koCF!_ay)Z2LbQT6s6{SR@`8Kt7T}31DQrI8JiNJh zmntT`j538?f_E|A96XKn=O{R*liqwv64Ypn{+j;1>367CJsfGpF{N6f{)=WE_m$3% zPsYac@=@_La?0f+_3%Qx*?6&|;=Y z3Xh;|JQYKfz?WuW2c8*zO&o7}#HS#gMN9n!=?_s=$ z@E*i_0PlXh`|uvYyBF^srA_{yjMQ6rcjMiKcPHK*ctTqZ;Z{67tYW-LNs2-|b(1&X zEyok76`ITNuEV=lk;QuQ8hSpWH;>+r>D!swm#KS=ev$Sk6`yyWNmS3^zZo26NTYQ* z!yk{0=i*WEG&0NL5j?M^`3}7~p*t#;82fojl&|yg&cizwZzi58_T3U4J|H=dbr zDhlvA@kFDP<956@yjFTXR7_4gCDNoK1n^pQEZ_0$JMa6f@$a_IXfyy4dCOmOC#Wv8cHdv2WhbIoFkQ%&d zyed31(^GIaP&NYpx|$1twdr_{rkPZ=QuZU=s<_Io5cTmq zdZaHdSC8fFAtSvPeP7Y9D+u18POaRf`!xn9GC0VPw49Yu@(;%7rK4hKc3)-*1ACJE|zQk>X zY>K}L{>Jzk{pH;V-^}|J+=agZz9@hy`lopR#S^&`upi^C!21aAL%a|0-p6|n?_Ioi z@ZQE#Lsw;d1MhXb*YIA&dj;=hyqEA^)LV0}!FvJkKXe_A_dMQnG|r^weR`AhuApy6 z`fjFw5Bf7SAFSlb_34<&;HeA_F!T;Xr40Xrvv608ilC8Vt`LIeziBbk2o8BUwCqQP z*h3p-hC1h|&1mg@9eh)q$xmezc{Ce=ulRP_uasSIC;oc)>*6bw7?BWL3x7@gVtg?o zdDHlX_yzbW{Cxa8{9ODbehz*De;ocAcz@&lg*S{hgg1ycfY*=Lhu4eOgZHPtlY2E? zN8tT|w;Jzvyx;JC#rp;CXS|>AvUs|}>Rjs2rN$IjkpRr0_apkGCGI9_U!zr*?Whhq zCeKw2p2DD(WOR?)zZm{#?4I$;Q86^~%ax`U)}2f3x#rEMQd30DxitJPU!hd6Z3XSL zR>_xV#+SrVbdIBT6?LW~O01$x3&g+Dr02&gAZglPjwy?L?WpLbQTQE(e<=PT_y^-3 zgnuCZ0r>ml?}xuH{yzA7EE6H9~GRt2Blxg zE}Tc#!wmK@^cKyv41YK_j#rI}qmf^(`d?iv{1pEL^9}up#D9<}Gur5Y=xg6Gj<=u+XKv~3L?w3@ZRyuIm6$W9z7Be_2{W<*Jw z4HAj(C=v9!*P?}nb!h3M(TtlpjWY7hFrEGj->{|Re`Taz!`}^mSNvV@cgEide@A@r z+Vk)y;BT)aMJ~}BXphI1JHU~EjU9u-p~#a#VAOp9-$Lf@ZH`T3N~F%N$(zUH*f$G-&s zVtupV4$91*FSs3l4!-!B;xsp-)U)wt;cIS2A<}%V5dB`j{}2B2_|M@>SU(c~8T_a9 zf?U%nq}HTTr`|-Pn`v7`i{Yh(htQ@SrbD#W(3wNo0Ck36C8SSnq(s&>o^NT?ot+~c zcoF_X_z&VgfPX*!efWa*9{juU)fJnuvOv|)rT2aMwx{m~`gf!M2Wr&DwTn)ZZ#UDh ziBjdRq{>K?sXfL<@|w|*JkZEB*O*rB-+?cDZo|J7{}%k4@eNf=eqvt#D*lc5H{dVF z7he-t%kZzmH+(JeF}{|R6?V0JMGn4dxAu5kgD;A$;HCIW@E7A>uK40xsZiJ={Dt@n z@GqlFRsV7P$M7G;e+2(wN;5Q^Ppwp`pHD>_b#k9irMQ%C7CxW45iWHwZQDX8<#}`_ z!Mr+eFO!?YsTzTNE#vuyp8wD*#p3VLw;g@U>ED%3Ewb*!fMuRoS4RGG4EE6V1j7l2 z-ya*tYe&V=$T`;<9Lp{0#?y2@Ez-+#zTTf6#<%b@_&?(Rfd4)ID*W&8zs3Is|7-lO z@K@q@>T`X~qZN^rS^L?bqUqN-4MjzaGC1 zUpz|Q8hq176ss_+@U`x%unPQg{4#v;D0xfpzr_CnUp(ppr7ckLe~$kdz7)f&S9*$` z=PAq3`>x59W+twue;4|{XJAL_-=Wnv!6AcxGISin;~0K#Z2Ybp6+a`_T=%~goqGlT zllV_irx_XD72iV3fmCYX?xH-O&Rkeby%|u_x>G{uE3^$vJnjNo252;YB0WU&{j^_4 z#Vq-XZ>D?}WwR(Xz5-QlO&ZOjMF0L1eAV3*_#feai2nip`}ptSzl;A4{@eI(;lGLh z2L9{#ui>i|s9avge+gd~C>R1(Xu8ter8Jlir;4ecMZMPjW>I$m+ zS=0!Zg9x@E*qUG~f-MQQAQ(@uIl*QGn-XZTRor+xf{h9OMX(XUh6EcBtWTf|Y`O{7 zC0K{nS~~Nk(%3-h@09*R%OJjo@8YZ7+loxCkdb*MHa!? 
zbe>7qvGV1bm!*_Z5i%Xu{*e|nYBSUn9t10?$Rk*rU@d|*35p4{8u%;?T33BrN%AL9 z{W3L@3Rh8`qky~=W$kpF&){l?H1DY8>37CP@%m9wG&0Qf2E`yvP)JZfkfJl6KtExy zlpvQNNsvR3FrWUKU>w03#<>Un-}ryYnK%qzEq@4K5Dnn>!4Q{^peags*X1gVHbfKjZ&Ii(ZkXQ6VvYgdgGuG{|cjUwV-0*|L)#(fj~a zvuR&Or3om?P%)cw<1D_3GGnJ4g4vYLrt2EIexUa)`U>c~j((jD{*HkO3~1s-hZ;0O z|4!3KH0rLnx5vhC`KTBg8D{wihO-DRAUL03WC*^6lGz0363iqxhhPT5bb@Jgo<-Mj z1cwkDOrZLDAb~F6QA)#kQYwc12z1q(0`?);n_w@3Jqh-pRw297)lRS*Z5aZy$elF& zC+A{n6t@e(&ICIV>`1T!!32Ws2~^j&rOu#Us7r8$sL-^HbhD-?A5d7HVMPg@#~$fo zifJ;~B@89Z-;9=F8fVjBf^uJ{=Wp7dq%S33T7t|@^nXkBTQm&PX}a>_TADQNcr>-T zFYc|ek-TA4B#kU{!w8c15!_2~55e68M#@W@>?S{?`9a#Rm$T?c^U)h+q-+@lHxo>i zoxGpmEP^u$&LB9QU<$!$1d|C=!6y++Bsi7e1%guuB&!`ya5BO31Sb(ZM{pv+vjis) zJVT(UrwLR{PZ1nT@Fc-81WyngP4GCuQ3Q_>97*sf!4U+H5FD6hN;Q>^tX@5nBDuRZZE2teIaEH)1QVoifCb3dRzP3en z5g3Y=?kBi|;C6!B2yP{~g+PekL~tX)4Ft;xM3pWiSVnLifiCxWh~OFmDe5vrIQapA z^c*PFl>{P6N~n%y34s99M5Mfn2o@?~(GLU*2s9C?u=xb@2<8%q;L3Z6;*0gIiwWis zTqIx0qSU;;ct5(9(WU)*dGuXF{|*YvO;9Zc`2%&P+e+RiW1N($9mUXJwEcq_w3|jn z)5teBjnvNzDKVcexQWiG1g{XhOz;wchRPGD)||6VSBLrV2sO*6(n#Tj`-83;*3GMh za3xI!?Y!M&By?iRye*xk4-!nLY&f^td@Eb!%GBnKa8)MhZn@7dc$Tc_ruVxl~ zPr0&DK~PRmMj*l_ZwbMd1ZEbzST#qiO<|uCj7$S%2tFkcyHVID1RoQavADqW2MKM~ zG%|1gQW=Gcc~Ew)D%b-A_fvl%bt=0IwQ@+pzmT>as8VzIh7R4Qm4>Tms;A{|0*gSO z`H?`C@_U*eq20W`u#3*K3Em-io8T>iHwoS#c%9%if>#xvypyhz%ok04F5br&-%+KT zYI3QP9=QqBiVdkteuwIJX$?*MA)w`lA%iFWjR-d++<;I!bH62Ak8oY;Bf@nE*Ct$xQ12`z)EeRXgi>y*6?`>*^Y`+0rJ;eQ zi)dYgMw79$w2o@a+bSAVcpiaE;1HNzJl#w69GV}cY7Xr;DWJeGh2p*FeVx7}eOFN> z7G%N_+F`be!R@L4K)(E^89IU?snTs38^K#gMbOAJw;BX1=jesW_o$ddxp5Y0YG)3m z##wy5jFcI8(l=0|PUs_o4++${y|1tWwX*l-h{Qv5olKici8=%AQyQXFP?_#L zS2IUKO5mml^Qjmh%p=StOcLf0CJ4t7u0fz0_!q%2!4Sb9!2p5!;6A+|Zx@1If*$jg zg9!d4_=8|I!S4jW5r|MJXTK2qOzSv;p9n;#OzsP%=l(W&U!yOFJ|o-8esUfv5~xOR3wdbbK6KC854vAk;$i_2>Dw1Rdz(^ltJC+WiplS zq55K)AEWAG+O>1eOkpJ7rNY>W4`rmDq}+IlzN747LjC3tsw!(rL+;sxvj{IxeDN|u zO=>9YJVGO6q<$crNq7$748rMz(+JNdG#sH|Ii1t#Iz`Uh1;)6HP<%^|?nAgY;a-G$ znir&~RoEVch9srb21L6&!d(e>q2^-R=FoeqiCK%j1brITR5vw8@fHK$GAM!ldAi@G zQES-!48J-ynzxUNrjc@PH)w`C6YfN~BjFB|w&}Y`9bMQ-cG14N!&G+Re#Y3Bjtykz znnn9@(vo|P?7W?2B&5seVpl-}DNp{tsMkNei||guI|y$lyp8Zy!dnP$ zCcKHzbbRSQX?~pcTjWfdg_OLPWfU9ETJ)`PT_+=@+eroYS%hcGS8zSy8HA@3P9Z#v za5AC($0Wjube&45-zkJA6P`qPBH;;y#}gi>=kj!X!F(qDH+|#iTS`Tae2HnsIE#UA z7~F=zALy9HkgiPlis4trM)!_U(KXV~9V6%-OLz?7(S%149!Yov;o*db5gtl-2;sqm z2NB*!cp%}uga;7bL%2WT-Gm19-1*d9LhU7#b<3B^labW!`<5g;amsMYW`a~B_m-7L%!jF1w%55kC##OJmC$p(|-^yClsK|sJ?{eCkU@2 zyp~Y2QI`;2O?VaIm4vE=@-8J@Lb#Yv^= zq4W|;jI-FxuccPesTTW&oCRvouM@sT_$uKmgfA1mMAt;Z7YSb={12g^d5-W|!e
    U>I7DBshuA)RYcIY6s)Rp{9^ z!d5~Nr%MT&2_bAE6k)oQ#=lLD3@1(v(x)qwme9H}{ns(DHMQ?BxHWZV94g*KM*fow z^)md@*a+S=DuPA^x@!c%2Euy6Izq8DWvPa+ny`w{u(ae`lq;-)u$-`rP?SmD62dPD zzaSJ_lJ|2ZEc%-8Gr~^^#g-KI3E{_thAkCeL--NlhlC#xnh`AZ9ic|BnS}2WzC)bv)G)BGepB%`M!UssTFJmNU$3K9>H0Tq0ns)@TN70>9jfwt6 zv=PyUL>o{uM6^E9dPM6ItwXdn(OOgw5Uok1?LJCbM3g2fq%I;VphO3!{?xnkcBDy% zI2&lZl%{8CyHsJtU(@?Hts7CMmaPjaH3n%<%T^4I3=F!j=OTuB7=CeV{O%bQKO^_t zGg2ob!jLc^^a(v0E~Q0m#U*s8YNt+U+SIF-89_IlN0^~nNAd;lkAy$a{4~{bO^mjR z&!uuM71|?j0+Mf1HkVT4Ec%KPV;5gd=XtVIDwQ{77nuGbc@K4$Qmdy{QhOMw1%EY7+zXn zI`87$>HUkocj%u(Re}m3DOCYmDkg7+NmFXlRM4;y!!L}D=Dnk$X(XR}N6`G6@Grt) z!Xd&z!U4j5N|JrtPlV!l3X7@V zl!m#qBxusbbq28~LpY@dq9mnB+Fz&1pqe*9MnXF<=Fy;%m`95a(#@mJ#3fo~zQ|J~QQoe1r6{U8e-SCGzEu|!>7c(<|B;Q1(O-U!D*I0**XUbWq8!SBYLBdYMQ}NWi~H^a9a;6kniv z|2&aC@+{FaL{AevMf4=m6N)VUFVW*fj}fVC9wB;|=pmv9i5?)jpXffKdx`ELx|`@O zqC1K1AiAB}`7~&>H~GrFgen7fY8-WWH2x-Mj@Io}E6<^Qgr&};%cE3d&s7Xu%D@#2 zZf??_#*iVH1z*ZYzd-drxTo{MQE@a<(1XHJ?@6ZMV35-4O(`|4sc;X<)B@k6X+Bjy zQ2h;!Cew+1Xvw8vKFtC~l~D+({)!(9`lUomh!zuxAIn=!R7Ipk!^?;&h{}n|h(xI5 zEg|}n=nEpV_LY2)9rg|jXy0_D`tSnGhIcIwywQN|1A2iWZ)78mNK{*wc4M0 zh>4soBmXfvpBlSXJ~S$tM(TOU)XKIj)#Ir#-_N~RMoL#}=F_&PoXLC|euG1(m2!p6 z=o~?McUp{$kdj6({qdiPej>^e#dJNb;JodLA{sBFri&;f3W$6KC)9SoA^Ms~m3Spl zH<98y6*I1pNE4TORXb4|Q7cgkQ8QHvfvAZ{pX05j5i|QP%lqv9`R7(4ZtB@pLjjub%~`l>Nn!GiPs`tlk#VlHQE@a9 z&m#uMD(y1)nRr*?U8q?=yfg7mlzvaVBk>Nz6DV6rygl)D#M=^YLwz3c*2G&8Z%Mp` zGMWE3@p$6Ri8mwGddptaX#{ua*qpNUiTLcnU>LHT% z52DpXzZ3mN^egRe)1*#Yd-XS?(Lk8DEe*y_>_zK0JDfad>Dy^yMfRO;ZyLh95v z8a|S1gn{Cf^u1318dPeGnZv*m^XwD`SJIGT=s}Z0`~MCd8@)$IMbF4KkB-#Bg~S5w zGGc8Il6xNUT;fZKH7_CW#l&-nFCrG3lXo`pEaD4@#pdKSj4b&w@wvn^iO(U{&T1u} zPCSkHY+^HUR@6>>7O|GJ&y+9KC8J=ejI`;qidWEmhY2xEC1)!cxpOG_o_bTS^R?)* zH%Q&+pYkPj)OkF0W*jLpeNE2k#45>L;v{hnwGnZGc${JiRTXrRbuG0PJ>Stgl}6Jh zig%^&HJWr&=Mn}kVqh^9VnE&SA|VrpQSmgg&SM78_IGGnNS98PFQj3i zUYoa#jD&W-c_Wqg0@@bRr#p=Q zrhhR57cy`;gPTy&PD6r*6?96$#ld4^`1q(88VTp|kvh2_@xH|S5bsUA7u|Ofn-b4g zWz_jP18UBeGIC9Sls}UOGj*0fOxr$iINDAvqz;milsW{{zH=v0uCe}Ix_%?RiTFn1 z8;F#Lp8yM{LH2!aCw- zh@U2Yiug(5Cx{9nlW z^j*%tY$~KicVh-k8%h0}p$BB=+(yGcIFs|lXo&7*q@5>(sNPfjA@P00_Y&Vjd^ho3 z#Det>;@gRDQ+(kE#J3XPLVPndi_C|&pux1lT(icQe-33W)D{@$5FCNFSP9@9%FPg; zV??K-J!q+uT_`I39!=6%zlb_*tz1OaH`JRqCib9i5w(kmyNSDqJ82fY8p|&yUPQ$r z$`?_#h|)!rTuzCSeN4Q9_#-;aY(-M#p#SiJajHDtCw`CkUE+7BeU^6V`5QDJm*{pY zHO=%dVqg{nix~VDHR3?qF?7G;3(VwJ`uVYI=#!)3Y2=(IM{4NX#BULsDTbmJx=gQ~ zUMwTkE~CJ-*|e!v#UIdp7d4mDm`7zZrDhQ--w?Lk*|hCzJW|MbBy~e_CKD#SpAJ*C zi$t30Wf$&D{5$b)wCn%=N-X8J0_$hupNO->;!wZSs1S92A#p(L(;%-$>=HY~8uaBg z^D22;6K9BjB>q9Z#O^e!cDEC^5w{X+`cmndi6L$xZlvdHn*X3vwBjV%7SlIO>t>X` zMTN#SO%io7v@NYao19Li`yZ@~KQ$_zM$&m|1kVO4m4SL%=IWm#)3f;QzQHrupplv_uHLeF_ z*CV?w*>$K`pX}OX*CM+n73+{K)*F-K$QF@JlPx4$KsH4-pRDO&igqKbYn7#{F{1qg zx>hSJZ!6;Osk@xo2I5u3-w}UH^Dc_XEuwc7eG}y@)KIQHG#bY*U|=DG8!_0y&^8R+ zXCgnMW8T>H@##_VGZN0z2ESVU{cniBCjN?eCC#r;y_l-SRBDi0Ou4C=1tVkNL6j|~ zbTK81=`;gy()4RZ%`#GLCXs2UX*1RozfX2=vU`!;lkR)S?m>2UvVve&vb&Jone0wv zcO<(5jlyRF+3odE&Uji*B)c8iZOLv!c5AX*k=>H)7L=}3eExK@GLb~Cb@Dj+Y9 z>?UM4Ci^e4(zbCFSwo5QRZbz<4X9k7p06lvr1v{2N7_~keS`EbWZ--T7BILWgY8sl z57E60n?67Pk+HFSW>hSVeDjRKGOJ@2ThY=;+(+C?+(Y~)@gKyiY5$O>#Wac1Ev7+x zEDf%CTT*B2#BS6sre-nCuTs5)Dnr;kfvkrnNd zcOKcfWG^Lq3E7J&Q^*{$7m*e1l6yAUS!6FDE7~RRd1TKet8+Z_$(}=Y23g@cjqKTE zr;;_Daq@YxXOcaGo|ROtONki<^Gq*Vq~Uvj{sj!2$AG33-lbZ-qxQ?c%rq9!K_Avd7TVO|P^e8a|Y|fc8`A>!<%RN;Ka)pTQ0I|HjTcOp0Rr`?|AvXAPLH zk*i*F&N=6N%{k{Bu84{XC@Lr@C{a{IL_o#t#l+^k``Mk1CJgvPF-+&~2^-}7*4+2c zeV+As<~dz;s;0a8+o!6|sc>qblR=5ji3}a8EafXlM)TES(KNQrt2tRTx>|7uNmY#F 
zNFGb_7?MYmJc{IzB#%&HiKNH~ewl>jFp`IoJcQ)IBo88apecj=D4J%bD4a$`)RqK=KE-hifU$d4Ars+$IxK(sJs%= z-_o8{u*h9iqdif?JBf4N%M2)?(z#!Tqyyla9iR3(z zb4ku2X|%QCc9OG5&LsIF$rnh@AUU1n^CWZNNs6YK;@wDcD#>R_K0|T}$;tF7$J0tF zQVyd#Rw~t&G#Wc9Pgd_3+I7f{*8ldBK8W`H z$VfC(WOO{;ozw`oGuRDF%!4M4jOA;?Vrguh*9?~R59mJSsy_*m(jXon zDHI{8I;1g}2k5znqyTw`lkd`F`=- zfTT~?CnP;;9BL#lTPjQ8wzNxD`)GK8h88;23rXa9NXpIWSZc>oGnVSHRE?!_EEQwv zG1`9qe37z;%o9upSKNkm2-d+^2Vt3Jxa1D3156g3()t3{{#g5A?TfV!*4|iqVHuUN z{B0~#$dc=@cE{QcYgepYuy!WZMfw2JUz0hVOsFh{I@A1i`mdl;t>h>YGHc?s zV*P{eudueD;wv~AYjYK(=pwAmur|fo1Zx!5##kF+ZHT2UVhWd=EOJpM>-@j;W@uBJ zHdegCZD@a+hOu<|iskK2yu#*Ou%t05EJSa|x(!PMRf^t%bu-pYST|y64Zk?o zV+lxUOb=533RPQ>HZ`iqG`{(2iN}!Dq{HP@2*I1gE3i$%xzv9`Z8Jm9jf~(M!y;&G znm2L?UPI4+u&&0s3hPR&E3hudx(w@5EHf~cr_X&cmJqrS>jJFvu}tTi|B^`DSklF< z^u4HSiLqqLa(%W5mA^&D*}BF|MU8Dqz54G{6fC$}q)0v{cQRKaC9wglJgoKUNMNl; zave$aE<-d(rUyw5knAU!C7B_arfUUlW9ikNtg*D@K0+tW67@f58cX{-bOzK3)v;I) zV5tXskW?q>{q>l#189_&R-+XayiE31x)#$fRL#Ia-oHug%3y-V6(szTFOlbt={=SQ68du*|?%yiugI73&17*z!b67-; zwezMywE01r9;E6I8s+yxW0!Fn9@Ik%w-(9M5>$P1X>ku>JxJ|C)M)znA*ysNrZ}Hr zeM;hKtQA=5P{vRp<|kMmV||46A=U?2?_<4(^)A+Otaq>^@^9(JWfGxjSW~f{#d?Om zSy)rBCS&PlPhmZ&EG4&^eAi%2!g>Phajc106R^f(J%(j8r1GV7=o3GJH4f_`Fr8Cu z3sR=BFFS|^^**Q4bBM{gE4_c}S~Qyew;8yBu2Bqb&7d6Ctzc-{$WpmvSPYHLv&3MC zj^g}A_b<58JVf5AjV)Z{)6u1CEGrVo+M98%pb-As|`!~l){p|W~?SG4Us6?fK`uGhb0X_t0gAWV-&JH)r3V@ zuVXF5GAdR6T&x9HuVSgDyo@y;>m{stSmMu7dR+IJjirw;6YE8+7qDhvO()e(dSB8y zMeP(axrX=+x?ZQPoL-x5In&Z=kw2-EvldON{EZ%b>98moTW4vmZaC`G@;`UdN3ERBIk=+ucn zOlO4EizQ#>%FvBvK1!j+tpC93#8S&>1SW46tahv#tZJ+(EGbdSU4gX{>q{&tQHp*} zr_Pt1LXSHBhxMoY9D4N%6^#=qZlcd{O3e64+0}HMr)#-(_`RZQ?0nKaWONw1v71Vu z^Fgv{`ro4RJu2U)M%s{kX-OWZjx3MMhDFlYILi!@-P%aqPj$NldswSkl22{TWLRF5c6h@^_-r z09oRbct+P^d6+Tr!ucYl--+bU7KztkA3+Klby`a|nUlmTFmF~mhCZFZl%mRJKs#8~ zQf|dyn^KC;VCdPAaeZr8T#dc+mci9N9Q!crL$MFRJ{bET>;vhWi)~c9yl2gG8nO4s z-Vb|U?0vBJ#@-8iPwYLESaLJ=?z)y;g}od0u5_G_y$klv*gIkW3)|R#1t_BPmCV{e5mOM6r7e_(5{>~w6SW5qVd-V9spO|VB{Z;ZVW_J&F+xEy-} zy7jD_qIE8I9J>s=6uShwn7VvYZKSn$P16u3lK5W#iQh%{MRKD~zjirZN3-d}@>2}9 zQm1uS&y0-W+rwgLET6Xxh80?_=;)rM|DyeUI#bk|FI=jT<#AMxqiP)XP1rYLONo1g z#ItlhjeR}#b=cR^^C0#$R6K%xHMUT^68j2lz2#-tmttRneKGb$*cW17fPKDhT=o>U zsfL9wVe9?%4bQ)ZLDk2ziC@*uG-_s+9Pv@L=c={}#HBWc$*sHF^COM4ZATQbEESqVQX+wjC-)}#=Z+%sOVZV;O5c@RjQ?XCMJ{kKYY@=G`&%{0f`*`f*u#d$)2K#92 zqp*)uV!X!WFgm`zP6m{x`)~-(lcA2DM>FX3jqunmn>3zB??A z#?pB=*Dk(|E!kOyy%c*1wwkcTro{8G-=I%5Z<1ogb*3&vL6Rde|y9T&sdbZf7}p44lp z>Xvj>(5FL+Kc=%ETUwP$qBgGwTeG(ccVTy8C$KdNsAxMj*lpOFN0e7e_0mQaDLIwQ zSM(gFV7^=_2J~xH(R&Omr&_vC6SeAJo*o&!_lHH#*fZ}N^lHZI{-t>$c0W zmQXdG%JC$gqrxx>=a|$g`iyRo_q0fHEgeS8$}Xe22hO4UN2+8ZXeCxo1TR)v(OFc< zX}ReXi!AIR>_O}SY@w3H&S0l$UW=W=j<7@QoV^!d%N4{1*dDfvZF;}>j{9sr#jb-yeP8L^V|AZ}FTTlN1`+MwF*wVEX)tJ4Sp>MFi zrc)4nMM6iGzl?JL&i*+2;p~gE56<2=d*SSfvj>i8Li3(7<*lLPQk>mzcE#BRXJ?$9 zaQ=m}BhC&u+bc_%(*KDw8fQD4ZE?22*&1gnoGo$wK}`o0KjLhGvpLRYIP%QagR=?F zD4dOPHp1Bu#~2K;QFQD1oaVZ!;+8?iy(-VCZk0 z_5R_oxEWjKLxYWD}A6U8tF$Yw4RLrsD{$OK>j6xd`V%oC|Qy z$2kw@TpShW9GtUp&ce~7&(NO=XXBiXW7@L9SvaTSoPu*Q&Pg~Y;+%kUJkD`A$KV`| za}>^zIMTQqNbg3vjm#7p7Lz>%=IRFJ(EmCES1|AvgPV$xf3`^MQilG<48upm;%F?H zk8(I3j&m5!p*V-&98BLUI0xYzsQbqaJn;%TF4Hy7=nuu!B4wIZ(R8?Wz4Xw%w-~XU zu670;-_pAkjix3S$u@v}s^?#Wb2ZLYI9K9afpa;|W%RvDhXQ)B$8bz%oHq%_s71w9IOA|0#(4MdOPM~cII{%_U(NgLsP^Z%oCQvhx>WMhjI4YF+ z{zX!Ab|KwD(;H;E#VEXq_QU9Voeueuy^MjSRDCFRe!$Q+3_US&d-%z)I2!BalN^pZ z0!1ZQiSwmyTlxl$^tBgobbf@`pW%FpvjRs-nxdcJe2nuE&WAW3;JlCX9?rWs%W>Yp zc^l^~oMkvmahBk`iL)3-H(Z4CI?h6z*Kih?SDGnOF0_x;HLfH7^vNcGOnU7x*_YmJ z4Eqp6_NR3o%&X8_i zbQ;be&Hzq7P8KJFlg812$OiNk(*7A;>(TQX?QtBfxY8>&<22zkl9)-KMy#&H-AcD9 
zyAW4zbOEmBbp8)_3*60dH=}z$B^DM?*-r1aG~_z7Q_W+4r>h3X$MJAn9EZLl>hxsI z%IYB&P7+57j>3N^d(6i99p^V%dvMl@5!YHnePOw-+L?3{nWxEgl0A`ZuM$hO*k=I) z7c;P!K@F1rNsTnc2_xh8>9F`2tL4*NUH%p47o0UXQm$08pK(-ue!`KqquyXGO=A9l z^F7Wg94+hmoJMiJrE7h9R6oDL`5H%!mzaIDm*e!}=!K*bi_?YEiIc#QN~~x*4mfQ% ztvC%-KS9+KBxX@*1glgFke{IM8U+hqFlgwUT2lw}9v3ODFzn4}(yFBWX)L8zYplK( zujmv*exqtb`ig0L0`~;cn~~m$v?}sbWD>M3p<$VF#c!qaF$OMT;0?MrqFK(CjL}+f z!N?eXHY|q5R{1O^gU90@hkGoodcID;PU9YpdlYS*xJS|=<`KAN+9bXm?qRq(6u5_G zu@AvL822FD18Gw90Nnj?jdbPfCB@hmR~zZHcu9&-9zCz)?v1+_?w&L#RkPw7;O>sQ z8?JnCin9yu&bT|_>V#-Tcf{QRcY9npmREE%2{Uk5x{$tWbuFA>O7uA$SL>Q*YI(6v zy17u-vI|IWO8Q@<8^}CKri1JWWP9j2#$>WY^7mt45rZ4iUeC}!4ENNLQT%*Z6phXD zxk0ge4eqA6o8XSZy$N??+#7K>!o7iRA+sT_+~t+x>YcB{EyukUH;#J^ZW->?xJtST zw**(!w;1;d+#=k|am|=$On_J|)X8Bb=7ttEI)Bfjxrdc0RX$KpPKI|et`1sJWR_)FaX;NFLOFYZ0Kcf&=vcj4Yi_b<42;NGrWWdiax+*@&P z!PRyKopOI5)y?$&6ZbT_>TplRJq6cbQuM1x%s^dm60UwH(l&{-Jn!yE+Kfq*X)%wM zGapB`i~f_uDAxw<3m90);D!t?rCM!ahx;z>a@==t-^P6lcNy+d+$Feg z;x5L019uUwT7-qT`ncEQ>hG`OzJmKQRq`ac1L-<4Y6#Q|8>J~vNArA5-(m*j>3SxW z8f{p@kaqI_jrr)6!{TP_la)E?n~(bv?mRuE@OhD<$LKIEO#D;x-0>pCU(jv7ahZ;9 zE~PQ2HQY#zX@84O!aWoB4BVWaydHfz-^A#Z#s3m1RP`F7QPWmWQm5ZnRIQ~}&S3>z zJ+7pvmV|k5>8teJV6LjpLVOPHY}{G6GjU(UH62RfbGS2bRRf;KRd^ciRNQBApTV7i zI~n(B+^29g@?)w>sp)d!D@5{4SX7O*QejYSDb=&eWK|qQ;t0G#yaK#@yck{{z1!ohkGCG~I^4f- zhj0gR2XOmwv$z@DG;Rtv!VPtsq7!igTrKnwXdbRhpBhcgRq1`Dofw@jb`~wGX?>EG zTul4n%{9IaO;1vlrhO${1@tVY@k!dHe>_RG{##ArC90mH@+m5wGR2)j-;KKFzd*1_>kD-1UZEx!jB@GOm%fJE#%M~l=qvaQBJ{uXs z%3(1ymP(~C)UAq6psT^$(!n)7SJ5vbF$r7}_fK3cchYP8j{6&~%rAw1#r*|$4X*Sd zMSnKg|G-_1`xEYuip964NePm(?{QbrUO`tO?sxRONu&7R;(mksHSSkb3bs)+h}nnR zi`#=MwMfw}+)i9MUKK~0kvQ$R;I`pvid)eZ+-BS++(v43pS9G8XiE#B^UyWvS45^onO(s(=L?Sy9kU%y z)_8i&EvZ)cA9z}6_!SjvNxZD*mKu`xg-G%1^xdRu;Z!HQbpC3qL(U4(Zb-UWE) z72;G%M4z?;39@LVW@Uw6l;b>(bz3DITWQL2&xnCj>kI=&vbD`|7XY=k=Pk{ z$IxozHFi4Q(Rg}wsp9&Fn$pAZ4#Shd1mQ+`i!iUXAJvUyvcaF?NfM9;!VPPf{vT;9><%AHvw-v zp6N|WtMMMidjxMB-otng;XQ~q7ViPPF-nhrY)FiR8cemZn~`azMZJwCq>rUP*N({; zlJDbGO)M~Co%Rn{|qbS#ao<_@LnhkQX-)Ndlr7Cj? zJvp;Xs~soPz9|ipsh>=p$&}wk?PO}S_;@mjSH&p#K%_j4_XSl`@T5tJs-5{1Po*(U zPeC;m;(dbmG2TabAL4z0_decxq~29}>Gq_ns8&61B6|dN%V>R@HuD*a|6t%H2G65+ z9z&xTsva51x?zztmP}m^$#?OVb{2_aXbb68J9s)>&3LEbol2h>{w+F=w#l?krb?frlpa&u`~yYS z*ZT5F^tI6;e|=|&k@t{D@hGa*|7db<5!ErWhm+O7_c8PvmASl$0eSE`mqE1wdFlt?tq+c77*b6J8^p)~btF zPii?0Kaj4VQnJ`c_As(-%2=pfaevY$XE>VO|2KoLF|;v5RU^w{!>|Y%o2DTri*
      mPv;Nn0O7TG`ijr!(IxMTl&7eCh6)oeH|j*(3?~*;iIhqVpT~O+ZyMfI zJhgEG%Di*o6go7hawdsm>D5Gxdhm_#G{U9yKk$CX`wdUe(T)RgjN({1 zWDh0VO8?O$>>`8hK!q*6; z5)Po}UHtv=G<>Qk+EzU4$DJ~mD7^LQYQZ$_>=ITz<*r#EPM)oBK`#Y@%WG7KZ^ed{y6-H@gKrh zO&p8=0R9;K`|WGF8n+3b<5lFZ^OS8{}%k4@o&Pv5&s5!qblX; z9@pVti+>IN)g<)ouEM_({|fxe@h`)_RJqDdqfWwo3I4_S7vWz>mrmQ*gxYnqtfJ>V z+O-sHDm6y9^Nl3OU!_8I`U^7Gkg0>&WSi)hGom~tmeh*mtzhsB2In)h0qxBrqt`kt zdd8k*5bg}OoMEd1B-7vR5&{|f%g`1A2!!k>pf7he@t!lOJh@n6J$ z0e=Smbo}S>pTnO<$DO)I;gfWl);sS35+~5Rn}TI(9OZ;y8``GP7*eODQqyRerj(+i zs5c^3EDu21)bTkL>Y;1N9!$287VU2BRbuJwCZ~>jnoWHdL*+F7GBSc~!y;&GnKpx9 z^E8^L(zJ%AY4p5Lllhi0`EpPJr&1}cVj2yrX~?m2y4_~fPo++{UqNCK-GAasBc6u; zDgFw44J<1B3BEBR3XBF=@($e^ndzkCE;`R8aU#8HbpOI1!XLyR!0*S;;%D&F_$mAd zUyVy(${{(_m((JGP4Qjo*Wo+(Qj-L-g`dP%fBzf}xlyJgsZzf!uK+naVPh)QKVMCz zh8_*}G|+zp1H0;;@f$>ne`D};247+*PUqh^=agYFG}a3S!^WK1cLMEZOfmL<)K8<{ zG+%KQTPw;1NE?1Dz9#$>ZpLrIZ^V~Yrf5BW9eypS_SNW5`A{vkEU!0c(G9BbEAcg= zE9Of4FY&b$UQ4K+Q{I9vL`sY*SnkoSZ5&++md>Q(Ze0r}(Rm)h1_b2J@8;3~+ofGN>RDx{(6EkLDRY}q9mijTZd2|+H##P7rJ#qYtF;-sj-DfTmd zC%&2o(;LJ$p-u_%Vz!#v=csuO^v2JTSWI`4x*n>8`SWzksjcZ)3qBVqc}t|+6)Am@ zj(c=1e1gvNNt~o$o_1{SscYHE1n1~l{ymM`kp7g+m1L^Pj;BWb^kMXOGjP1g{|bYr z()KYGvZ()tf0T}4u{738hmpzPY=W~0&ZKc2!5IXn6P!k%_NRm36apz{n-iQwprMTC z2~Hq5p5Qow+-QA_R`HI(|AXLY{ND+V!vBrnNPLY`D(MJXgw?P3zfdn9C342LHNo}- z(v*}snqWJEZ3#5Gq^Od%BG{7PA8<8+I_T#~yh)%Fd@X{_2sYK*=eJR%yqge=BG{Nf zCs``Gp~39{y5^9HlevOS6e&IO-|lq@sPa70R9Aec^|Vm(JNjbJLlvjoo&Od*iYe6O+;K2E1LniWv1 z6$qv=C^M1}KT@RNLi)6#rj6img2M<7C6K(`8tSK0FXk=;I}_|gvr&*r zz9y~1=2noooJ=LPd1UM8Ka~3A445xpZgkty^=a0`Ui;6{QQ2(Bl%j^J8?YX~H=R}oxEa0S8T`giFB(hJCx zk-3aa1r0XYTAI~EchdTv((~qsluJjPL-l$J7Ick_V%M-J8q1~Ye<-6 zE+M#>;35LO*aZaV6P!nIE{x0->h;2%1POu;0`(P&LeOT?e z1RoH*Pw*bWy9CP#-XSo3eZdNXT+d?>EF)M-V7lqTdq}@R#p`4)C9{(3byTZ=F|AAf zAOlD1T2d`irX@VH7%HN5i;>an84k@ujqTE7+RETfI_@|3e1y)6shB~poLa4=Ufs8v zdkt#o<4crO%p}2|bhZ;{&P&YS3DljaFH!hcf?o*M5NOWp1$xY+LeZhLZ>MYQ7^;nK z7yD5pz7c_$g(_CxQcsOguA@eOljd|CiMI$e%p_(tt(y{55mXXLX%c58!IuPI5J*o| z)HLe(%>)GZa_! 
z674z!v5=vcM@F%CSQL#7)B8V}EPYBg`XQ-~8)d!u1H&5&T6kL@-Fhh6Doy{RCN>m6Rb!6QpRS9h5fTJYgR)7nAvd>@{So=+TsQI|HUej9({G{0prgGo%&Y^GC+9 zZ&)0SCDUi5F&ss>G2upp8&dfK;Rb}|gmG$n2(?w#RN?|7UFCm@6uu@>`kY9~uXLDc zkHW|3yp+Cg=-p4TynATeknnuM^9au+)a}n9Je%+=+J7TFlkg1lAAN<>buBxd@H9dV zYi~_>3gO9wClN}udXex1LZu!@r~zF?k0Cso@F+r!f+%_f;o*eR2Yx14O`t~PM_Q!b z3iKZcz9%(A?_+emMCKwgvI;dyR!RRs)Gen)J2(EIZ#jcA8Oo>WZ}<-XYFGq~9rIPL zKCdG9POnpR2#qShw*>M*_fNvD#4FyGPzw@oAk?^s4zW<#w;+@b=M!#5sDUnVv|f4* zl`krLeiIeierM#QBqLJru}JwJgfj`1F~|Rf(0r8AX@t`WpC^2dP~oYB&k{aEIE8RB z;nRdq(eVJCmnm=Iqx5}C!3JS5tCr-L?~@Zyd4R5Al#l% zI<=ys3AZEMmZnZZxdGH3buqWrpW++9jU;4b2Xu@je3Wc;r(=8PWT_f z`v~tf7(POH58>T}cM;x6cn9I_gtrmiN_Y$5&4f1*-biYI^gMbe(z}MtXJoIU&ZbFv zP8$P<)B1t#RBlFe$`(<-ilM*Z)%u%Zkuw&{H@W&N*}9(aI>KuSuOYme@G8P939q1e zLqZ+>BwQ~eyp&K)aQg320O}WpcHW&P!nwhNBkuq|?psfpP#tgf78!DtMX~mL>m6TDTpfUFI zN7FFcTpI~L)V1Ivk@DXOXA{m+u*764HG?K4#(ph*R-|k^seWCHYuAHiznygp&xLAbgx~BH;wW@q~{N zK1wJ-7)R$7dR*Z+k@%2E=?ZFkO#yD8sa)4QE$vuGO%7L0=Ao<+}RG|i%M z7VSIGFpDtvFI77=x7~y~aYZ~yYl5(YP~C^35ax8HBMDmxr7MZoOxQ#y95o!GXgy(_ z9vfRlsGeP$Hr^#v&#n}sy~K6GN=T?ZfmzaX>5@G=pUeufSCCyv|Niv1z^e?bHMh`s z$omW(&(PeFQT%RL6pbD8T@J-c-LP0kLyV%fmp;=-mXs2HK=?l4dxR?Fa>92A-zI#E za2erJ!X<=n5-uiugK!by>-tl{hlC4tE&q-1HNpjiuM)mO_%h*q!j}l=5zZx?L&t;4 zm9M?mSDHs%LDylln|`b0Oj;ykzYwk=l*Xj+&xET9e?35DQygxVvv zqyATPph!#(__c4?SShjZ=#EHbNE>q{e>*bglld=MZL9o}{{5(3PIEB>ztVF$-8VC& zncBax2WHilFvl~6Pf+EXMRCp1$b#amIO@4AxkOTsS*rJpJK8R4gdE2z?YX*5xs zPYC6NO+sWUS$t5WAlLesk6db|OiF&H!&rbNxxuZObY5jrj2IO@OjJlzK$K4uBg!LM zpXy$s^$0cRtgiPj!Xd&z!U4j5!mR$3cPB0DQM(OchA>T-B8&(nb^)PJ=n=Yv(r$&c zO{-045hiI5NY5snC37AX>SZn?tJRuE(%;O07G0YXmzcLMuV(02hUSbcliv@ErmMDYQjGvnT;J4Hh8z8TS`M4KojUPjeyqESThy}Fuc zBccsy9Yv*>8xWNf#fj9+DOyTYLfV6>mIsN<6-=th6TJLCPbmOM!D&K1d5}iVHGSNvyQs3qfoj_zp=gN*II*v#>oOs6&DS8xmMaQiIh$zdYV*<^i0~* zWK7j7+KKF?RO)otw-w9N0_H;)SVNzt)y|?-UQcF^jNp&MB4{j`9}R*XB95`{E&QlTxM(!3+mFOn8|=aBiB>?LGBqkkXz8)=s2 z{eKJ|t~ZHIX6P7(W{r&9Ps3tote2nu*EW`zS{64NLV;$ZhsiEbjg zk?01Z>xr%-(mk#rx|--JqAQ86AiA9BGNMa~E}>x?nvEV%unFyEbT039+UC+~hJ4B- z{&Q(I!I-v`&82ZJ?K{yhmwLl1+Mhb{R*?!xzd$-g?-OJ`B6~5}PwC&Aeq+uTYH8pf z3?8On$(Qu$NcX?d&#WF6OJm2ZHgZ^-E7D=q%q8&=6>^i9rCMPFaO@kZ=2GENX)+aS z<;-0ABtY*HEhl=1=xri1xKdIm5;xsJ`N<*$#yT%wL+^j+cvQg>GbB;^G|?oYCx{*= znn*N(XgtwlM2`~b^QbmFO!N@ZgG6$qcOAW}=sHUOi|H)7m9+0n#VDd~BI!!=h&qW9 zL>)x!%2Ko+b>czPM$}3qT}jbqQtD-9kdBn4SdHU{WG^DSg8scoXjj6)4E#(Ysl@nY zbl=9%-|)r!^RQSNd*ds)#Cy zH1ZwZo21(CXU@jf$!DDHa|hG&Z-!nR zxt{;uuy`6P=l}j!&&xD>GLPnY^i4yDM`gHn2l zvRwWM(Z1B`P{o%>ZAIb}QnJeLAf;`B14@h?Lc(Z4v9Cnp`W`EEE#90+s+4Z@ z36akG_>t%%A~k>?5WP?I9$jY4sU+Y0B~9-bT?^h3DVJU~-lTjlQmW}}Gtg5whLrXI z{6K1bQtQ!m4ADBGzles221$9OpC=udf6IlQoX1>9g)E#s=&xtsAb1V5dFxmD4rSbiB=JPN98EmcO~&( z&=#aY5<2&BG!^bACtPA z)McbDC3Oj@i%DHXN`A|wupLY4d{XC;I+xVHNu5LLY*J^L5^PNCdZf-Ibp|PEYp0Pq zmDDMuP9~+r*e5DW-YvAvCv^g;<4GMyN~)8h$B;Uj)KR3=cPM%UDbtIV9Zu>nQX0RR zPmfjx%E}rNyJ#O0Mt<|Zr(4G#l#$Zpp^(+ap>|T6k&`*hw* z=3TNEko}m3Jo?QrMezk9C0crN2rZw$Qd<5-KlAIb2pW6lS0jC?jYw&%QP1B%H;?s^ zDkl}E!XZ^AM!blw+(1{%MAZF_)wRHAU*!^vsidAIr4_UjNy+chWKz=Do+9-msY%LG zvPz`%2^0E`&Kq^jH|kxX+}@Yd?kZTY5mh;5O2{2TwIDc{)Ip?-uoRp@li)po)c&OQ zBPDf8(S1nmP5W-7_97*9N}N4N?M_M@sZ)yXLQ1Y~cOoTSN>PXORNbt^sPHA7^sXg) zKG}~*$ctkQP11r|89bQ5HFTWJ(Dacda_z8q8jEIa4$mD)?LcaKQc~XZ{L!Sgqfsr* zwj@3!CGAa2X>W3n_7{mb)y4*i9Zc1HnsvUxd>W*@8D89U-o=~I^`a?xp5a8KUM8jg z>uYIPOMX7*lA1$mHmO;pW|Go4q4`(^Z;_gzYxz&4CD;q=V9#VIc zx=XRV8KmwcrMCBWQn!)1mDDYyZYFgTsT)bj$HeuduA@y;T+fm=jZo=BbiGRUJhC4u nR;+y|ns+@w!Tj?W)J`jhp)m|SU%USF>C@L+ukE+0^{4+o7t) Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id 
seq_data[seq_id] = seq.data - block_table = self.block_manager.get_block_table(seq) - block_tables[seq_id] = block_table + block_tables[seq_id] = self.block_manager.get_block_table(seq) self.block_manager.access_all_blocks_in_seq(seq, now) common_computed_block_nums = ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca99ce80064..d7882c0dc67 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -816,10 +816,8 @@ def create_engine_config(self, ) -> EngineConfig: enable_chunked_prefill=self.enable_chunked_prefill, embedding_mode=model_config.embedding_mode, preemption_mode=self.preemption_mode, - _use_delta=( - envs.VLLM_USE_RAY_SPMD_WORKER - and parallel_config.use_ray) - ) + _use_delta=(envs.VLLM_USE_RAY_SPMD_WORKER + and parallel_config.use_ray)) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index c234a23f0a7..6ab77cc38e0 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -1,10 +1,11 @@ import asyncio import os +from array import array from collections import defaultdict from itertools import islice, repeat from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + import msgspec -from array import array import vllm.envs as envs from vllm.executor.distributed_gpu_executor import ( # yapf: disable @@ -36,7 +37,6 @@ def _init_executor(self) -> None: # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. # Currently, this requires USE_RAY_SPMD_WORKER=True. self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG - self.i = 0 # If the env var is set, then we do not distinguish between the # "driver worker" vs other workers. Also, the rank 0 worker will # be executed in a remote Ray worker. Currently this requires @@ -307,27 +307,9 @@ def execute_model( if self.forward_dag is None: self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) - # s = time.time() - # import pickle - # serialized_data = pickle.dumps(execute_model_req) - serialized_data = self.input_encoder.encode(execute_model_req) - # # Open a file in binary write mode - # import sys - # if sys.getsizeof(serialized_data) > 60000: - # with open('example.bin', 'wb') as file: - # # Write bytes to the file - # file.write(serialized_data) - - # print("SANG-TODO input serialization takes " - # f"{(time.time() - s) * 1000} ms index: {self.i}") - outputs = ray.get(self.forward_dag.execute(serialized_data)) - # output = pickle.loads(outputs[0]) output = self.output_decoder.decode(outputs[0]) - # print(f"SANG-TODO e2e takes {(time.time() - s) * 1000} " - # f"ms index: {self.i}") - self.i += 1 return output def _run_workers( diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 5a7ccee13c4..83849b5e077 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,6 +1,7 @@ -from typing import List, Optional, Tuple, Type, Any, Union -import msgspec from array import array +from typing import Any, List, Optional, Tuple, Type, Union + +import msgspec from vllm.config import ParallelConfig from vllm.logger import init_logger @@ -24,7 +25,6 @@ def __init__(self, *args, **kwargs) -> None: # The flag indicates is set_device is called on # that thread. 
self.compiled_dag_cuda_device_set = False - self.i = 0 def dec_hook(type: Type, obj: Any) -> Any: if type is array: @@ -50,31 +50,26 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: return node_id, gpu_ids def execute_model_spmd( - self, req_or_tuple: Union[bytes, Tuple[ - bytes, Optional[IntermediateTensors]]] + self, req_or_tuple: Union[bytes, + Tuple[bytes, + Optional[IntermediateTensors]]] ) -> bytes: """Execute model in SPMD fashion: used only when SPMD worker and compiled DAG are both enabled. Args: - req_or_tuple: A requset or a tuple containing the + req_or_tuple: A request or a tuple containing the request and intermediate tensors. Intermediate tensors are None unless if it is provided because it is > 0 pipeline stage. The request is serialized by msgspec. """ - # s = time.time() if isinstance(req_or_tuple, bytes): serialized_data, intermediate_tensors = req_or_tuple, None else: serialized_data, intermediate_tensors = req_or_tuple - execute_model_req = self.input_decoder.decode( - serialized_data) - - # import pickle - # execute_model_req: ExecuteModelRequest = ( - # pickle.loads(execute_model_req)) - # print("SANG-TODO input deserialization takes " - # f"{(time.time() - s) * 1000} ms index: {self.i}") + + execute_model_req = self.input_decoder.decode(serialized_data) + # TODO(swang): This is needed right now because Ray aDAG executes # on a background thread, so we need to reset torch's current # device. @@ -91,10 +86,6 @@ def execute_model_spmd( else: output = self.output_encoder.encode(output) - # output = pickle.dumps(output) - # print("SANG-TODO worker takes " - # f"{(time.time() - s) * 1000} ms index: {self.i}") - self.i += 1 return output ray_import_err = None diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 287cbe336fa..5b3cb28f976 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -1,5 +1,5 @@ -from array import array import functools +from array import array from dataclasses import dataclass from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type, TypeVar) diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 31d52d8a83b..fbbbec17c95 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -1,9 +1,10 @@ import warnings from typing import Optional -from vllm.adapter_commons.request import AdapterRequest import msgspec +from vllm.adapter_commons.request import AdapterRequest + class LoRARequest(msgspec.Struct, omit_defaults=True, array_like=True): """ diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index df750d8dc73..9207a7ffa8c 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,7 +1,8 @@ """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" -from typing import Optional, Union from array import array +from typing import Optional, Union + import torch import torch.nn as nn from PIL import Image diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 5c0310a7d6d..4a58d8af589 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,5 +1,6 @@ -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict from array import array +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + import torch import torch.nn as nn from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, diff --git a/vllm/model_executor/models/chameleon.py 
b/vllm/model_executor/models/chameleon.py index b0978f22c51..dded95b669d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,7 +1,8 @@ +from array import array from functools import cached_property from typing import (Any, Dict, Iterable, List, Literal, Optional, Tuple, TypedDict) -from array import array + import torch import torch.nn.functional as F from PIL import Image diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index fb4b546b3fa..acd1eb39004 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,7 +1,8 @@ """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from typing import Optional from array import array +from typing import Optional + import torch import torch.nn as nn from PIL import Image diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 1a5da209bf4..cbe4001d054 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,8 +16,9 @@ # limitations under the License. """ PyTorch Fuyu model.""" import math -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict from array import array +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + import torch import torch.nn as nn import torch.utils.checkpoint diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 499583bee8e..7d95f53b829 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -23,9 +23,9 @@ """Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math import re +from array import array from functools import partial from typing import Dict, Iterable, List, Optional, Tuple, Union -from array import array import numpy as np import torch diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index b851c125067..17b80c81fe3 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,5 +1,6 @@ -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict from array import array +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + import torch from PIL import Image from torch import nn diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 02e878f8fe5..dc0a57d85e4 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,5 +1,5 @@ from functools import lru_cache -from typing import List, Optional, Tuple, TypeVar, Any +from typing import Any, List, Optional, Tuple, TypeVar import torch from PIL import Image diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 8d531fb4be2..798ba2982dc 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,5 +1,5 @@ import functools -from typing import Dict, Optional, Sequence, Any +from typing import Any, Dict, Optional, Sequence import torch diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 650b815ba51..204adac889b 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -1,4 +1,5 @@ from typing import Any, Optional + import msgspec diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index 9fb7932b15b..499dd5114d5 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -1,6 +1,7 @@ -from vllm.adapter_commons.request 
import AdapterRequest import msgspec +from vllm.adapter_commons.request import AdapterRequest + class PromptAdapterRequest(msgspec.Struct, array_like=True, omit_defaults=True): diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5f0a7fa4515..5c0d7946604 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -1,11 +1,11 @@ """Sampling parameters for text generation.""" import copy from enum import IntEnum -from typing import Any, Callable, Dict, List, Optional, Union, Set +from typing import Any, Callable, Dict, List, Optional, Set, Union +import msgspec import torch from typing_extensions import Annotated -import msgspec from vllm.logger import init_logger diff --git a/vllm/sequence.py b/vllm/sequence.py index 26e8f4910d3..c544ebae7e0 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -6,17 +6,17 @@ from array import array from collections import defaultdict from dataclasses import dataclass -from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Set, Tuple, - Union, Any) +from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, + Tuple, Union) +import msgspec import torch -import msgspec from vllm.lora.request import LoRARequest +from vllm.multimodal.base import MultiModalDataDict from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams -from vllm.multimodal.base import MultiModalDataDict from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics if TYPE_CHECKING: @@ -132,10 +132,10 @@ class SequenceData(msgspec.Struct, omit_defaults=True): # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL + _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) - # Used to get delta input. + # Below fields are used to get delta input. _new_appended_tokens: List[int] = msgspec.field(default_factory=list) - _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) def __post_init__(self, ) -> None: self._prompt_token_ids_tuple: Tuple[int, ...] 
= tuple( diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 5aed0ab992f..b66d8fec9fb 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -1,6 +1,7 @@ +from array import array from itertools import chain, count from typing import Iterator, List, Tuple -from array import array + import torch from vllm import SamplingParams diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 1029c855705..b959bb9fd09 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -1,7 +1,7 @@ import time from typing import Callable, Optional -import msgspec +import msgspec import torch from vllm.model_executor.layers.spec_decode_base_sampler import ( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 0c3dfd29dca..3f49aeb8817 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -18,9 +18,9 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata, SequenceGroupMetadataDelta, - IntermediateTensors) +from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, + SamplerOutput, SequenceGroupMetadata, + SequenceGroupMetadataDelta) from vllm.worker.cache_engine import CacheEngine from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -319,13 +319,6 @@ def _get_cached_seq_group_metadata(self, seq_group_metadata_list): self._seq_group_metadata_cache[request_id]) return new_seq_group_metadata_list - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: - output = super().execute_model(execute_model_req) - return output - def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest, From 007fe8607a58470593d0b69557109d53940d88e7 Mon Sep 17 00:00:00 2001 From: sang Date: Mon, 5 Aug 2024 12:56:59 -0700 Subject: [PATCH 17/36] fix a test failure. --- vllm/lora/request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/request.py b/vllm/lora/request.py index fbbbec17c95..094e4b88f7c 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -28,7 +28,7 @@ class LoRARequest(msgspec.Struct, omit_defaults=True, array_like=True): __hash__ = AdapterRequest.__hash__ def __post_init__(self): - if 'lora_local_path' in self.__dict__: + if 'lora_local_path' in self.__struct_fields__: warnings.warn( "The 'lora_local_path' attribute is deprecated " "and will be removed in a future version. " From ce64b8de8ee7acbd2884d348d706231b379d924e Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 7 Aug 2024 10:55:48 -0700 Subject: [PATCH 18/36] . 
--- tests/distributed/test_pipeline_parallel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 431ce3ef05c..6b31a48198d 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -28,11 +28,11 @@ # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False), # (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (1, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + # (1, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), From e8e29e1805a7ea65eeb04b574cb5edbcdbab3221 Mon Sep 17 00:00:00 2001 From: sang Date: Fri, 9 Aug 2024 17:28:39 -0700 Subject: [PATCH 19/36] fixed --- tests/distributed/test_pipeline_parallel.py | 31 ++++++++----------- tests/prompts/example.txt | 3 +- vllm/config.py | 5 +++- vllm/core/scheduler.py | 2 +- vllm/executor/msgspec_utils.py | 25 ++++++++++++++++ vllm/executor/ray_gpu_executor.py | 8 ++--- vllm/executor/ray_utils.py | 22 ++++---------- vllm/inputs/registry.py | 2 +- vllm/model_executor/models/blip.py | 6 ++-- vllm/model_executor/models/blip2.py | 6 ++-- vllm/model_executor/models/chameleon.py | 6 ++-- vllm/model_executor/models/clip.py | 6 ++-- vllm/model_executor/models/fuyu.py | 6 ++-- vllm/model_executor/models/minicpmv.py | 4 +-- vllm/model_executor/models/paligemma.py | 6 ++-- vllm/sequence.py | 18 +++++++++-- vllm/spec_decode/batch_expansion.py | 4 +-- vllm/worker/worker.py | 33 ++++++++++++++++----- 18 files changed, 117 insertions(+), 76 deletions(-) create mode 100644 vllm/executor/msgspec_utils.py diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 6b31a48198d..64847b16e26 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -18,26 +18,21 @@ ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, " "MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL"), [ - # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - # (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, False), - # (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, False), + (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), + (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), + (1, 3, 0, 0, 
"meta-llama/Meta-Llama-3-8B", "ray", False, False), + (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), + (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - # (1, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - # (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - # (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - # (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - # (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - # (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + (1, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), + (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), + (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), ]) def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL): diff --git a/tests/prompts/example.txt b/tests/prompts/example.txt index c88e90a0ff6..cef4d1d7687 100644 --- a/tests/prompts/example.txt +++ b/tests/prompts/example.txt @@ -5,5 +5,4 @@ Describe the basic components of a neural network and how it can be trained. Write a short story about a robot that dreams for the first time. Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. -Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' -vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. \ No newline at end of file +Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index 6a3efc736d5..5c05f5d926b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -789,7 +789,10 @@ class SchedulerConfig: (e.g., beam search), recomputation is not currently supported. In such a case, we use swapping instead. _use_delta: Private API. If used, scheduler sends delta data to - workers instead of an entire data. + workers instead of an entire data. It should be enabled only + when SPMD worker architecture is enabled. I.e., + VLLM_USE_RAY_SPMD_WORKER=1 + """ def __init__(self, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 88ceb323bf7..71e9e0bd491 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1021,7 +1021,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # the first request to reduce serialization cost. 
seq_data_delta = {} for id, data in seq_data.items(): - seq_data_delta[id] = data.get_delta() + seq_data_delta[id] = data.reset_and_get_delta() seq_group_metadata = SequenceGroupMetadataDelta( seq_data_delta, seq_group.request_id, diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py new file mode 100644 index 00000000000..c47ec59555d --- /dev/null +++ b/vllm/executor/msgspec_utils.py @@ -0,0 +1,25 @@ +from typing import Any, Type +from array import array + + +def encode_hook(obj: Any) -> Any: + """Custom msgspec enc hook that supports array types. + + See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder + """ + if isinstance(obj, array): + return obj.tobytes() + else: + raise ValueError(f"Unsupported serialization type: {type(obj)}") + +def decode_hook(type: Type, obj: Any) -> Any: + """Custom msgspec dec hook that supports array types. + + See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder + """ + if type is array: + deserialized = array('I') + deserialized.frombytes(obj) + return deserialized + else: + raise ValueError(f"Unsupported deserialization type: {type(obj)}") diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 6ab77cc38e0..0d8df875174 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -11,6 +11,7 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.executor.msgspec_utils import encode_hook from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (_run_task_with_lock, get_distributed_init_method, @@ -63,12 +64,7 @@ def _init_executor(self) -> None: # Create the parallel GPU workers. self._init_workers_ray(placement_group) - def enc_hook(obj: Any) -> Any: - if isinstance(obj, array): - # convert the complex to a tuple of real, imag - return obj.tobytes() - - self.input_encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) + self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) self.output_decoder = msgspec.msgpack.Decoder( Optional[List[SamplerOutput]]) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 83849b5e077..6670d1c1e32 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -5,6 +5,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger +from vllm.executor.msgspec_utils import encode_hook, decode_hook from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import get_ip, is_hip, is_tpu, is_xpu from vllm.worker.worker_base import WorkerWrapperBase @@ -26,20 +27,9 @@ def __init__(self, *args, **kwargs) -> None: # that thread. self.compiled_dag_cuda_device_set = False - def dec_hook(type: Type, obj: Any) -> Any: - if type is array: - deserialized = array('I') - deserialized.frombytes(obj) - return deserialized - - def enc_hook(obj: Any) -> Any: - if isinstance(obj, array): - # convert the complex to a tuple of real, imag - return obj.tobytes() - self.input_decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, - dec_hook=dec_hook) - self.output_encoder = msgspec.msgpack.Encoder(enc_hook=enc_hook) + dec_hook=decode_hook) + self.output_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) def get_node_ip(self) -> str: return get_ip() @@ -64,11 +54,11 @@ def execute_model_spmd( stage. The request is serialized by msgspec. 
""" if isinstance(req_or_tuple, bytes): - serialized_data, intermediate_tensors = req_or_tuple, None + serialized_req, intermediate_tensors = req_or_tuple, None else: - serialized_data, intermediate_tensors = req_or_tuple + serialized_req, intermediate_tensors = req_or_tuple - execute_model_req = self.input_decoder.decode(serialized_data) + execute_model_req = self.input_decoder.decode(serialized_req) # TODO(swang): This is needed right now because Ray aDAG executes # on a background thread, so we need to reset torch's current diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 5b3cb28f976..57891ed26af 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -107,7 +107,7 @@ def _default_dummy_data_factory( # Avoid circular import from vllm.sequence import SequenceData - dummy_seq_data = SequenceData(array("I", [0] * seq_len)) + dummy_seq_data = SequenceData(array("I", [0]) * seq_len) dummy_multi_modal_data = None return dummy_seq_data, dummy_multi_modal_data diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 9207a7ffa8c..4bc39a43c8c 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -54,9 +54,9 @@ def dummy_seq_data_for_blip( else: image_feature_size = image_feature_size_override - token_ids = [image_token_id] * image_feature_size - token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(array("I", token_ids)) + token_ids = array("I", [image_token_id]) * image_feature_size + token_ids += array("I", [0]) * (seq_len - image_feature_size) + return SequenceData(token_ids) def dummy_image_for_blip( diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 4a58d8af589..ad32b96cb13 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -410,9 +410,9 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int): vision_config = hf_config.vision_config image_feature_size = get_blip2_image_feature_size(hf_config) - token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size - token_ids += [0] * (seq_len - image_feature_size) - seq_data = SequenceData(array("I", token_ids)) + token_ids = array("I", [BLIP2_IMAGE_TOKEN_ID]) * image_feature_size + token_ids += array("I", [0]) * (seq_len - image_feature_size) + seq_data = SequenceData(token_ids) if isinstance(vision_config, Blip2VisionConfig): mm_data = dummy_image_for_blip(vision_config) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index dded95b669d..7a1e4bcd6d9 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -69,9 +69,9 @@ def dummy_seq_data_for_chameleon( else: image_feature_size = image_feature_size_override - token_ids = [image_token_id] * image_feature_size - token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(array("I", token_ids)) + token_ids = array("I", [image_token_id]) * image_feature_size + token_ids += array("I", [0]) * (seq_len - image_feature_size) + return SequenceData(token_ids) def dummy_image_for_chameleon( diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index acd1eb39004..d073d9d3b5e 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -52,9 +52,9 @@ def dummy_seq_data_for_clip( else: image_feature_size = image_feature_size_override - token_ids = [image_token_id] * image_feature_size - token_ids += [0] * (seq_len - image_feature_size) - return 
SequenceData(array("I", token_ids)) + token_ids = array("I", [image_token_id]) * image_feature_size + token_ids += array("I", [0]) * (seq_len - image_feature_size) + return SequenceData(token_ids) def dummy_image_for_clip( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index cbe4001d054..6ac61c494f6 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -99,9 +99,9 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int): ncol, nrow = get_max_fuyu_image_feature_size() image_feature_size = get_max_fuyu_image_tokens(ctx) - token_ids = ([_IMAGE_TOKEN_ID] * ncol + [_NEWLINE_TOKEN_ID]) * nrow - token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(array("I", token_ids)) + token_ids = (array("I", [_IMAGE_TOKEN_ID]) * ncol + array("I", [_NEWLINE_TOKEN_ID])) * nrow + token_ids += array("I", [0]) * (seq_len - image_feature_size) + return SequenceData(token_ids) def dummy_image_for_fuyu( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 7d95f53b829..22ba2c1457c 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -336,8 +336,8 @@ def get_max_minicpmv_image_tokens(ctx: InputContext): def dummy_seq_data_for_minicpmv(seq_len: int): - token_ids = [0] * seq_len - return SequenceData(array("I", token_ids)) + token_ids = array("I", [0]) * seq_len + return SequenceData(token_ids) def dummy_image_for_minicpmv(hf_config: PretrainedConfig): diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 17b80c81fe3..ac0093417b1 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -50,9 +50,9 @@ def dummy_seq_data_for_paligemma( else: image_feature_size = image_feature_size_override - token_ids = [image_token_id] * image_feature_size - token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(array("I", token_ids)) + token_ids = array("I", [image_token_id]) * image_feature_size + token_ids += array("I", [0]) * (seq_len - image_feature_size) + return SequenceData(token_ids) def dummy_image_for_paligemma( diff --git a/vllm/sequence.py b/vllm/sequence.py index c544ebae7e0..ab50712dc75 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -162,6 +162,11 @@ def prompt_token_ids(self, new_prompt_token_ids) -> None: @property def prompt_token_ids_array(self) -> array: + """Return the prompt token ids in array type. + + Note that the array is in "I" type, and it is not compatible + with torch.long (2 bytes vs 4 bytes). So beware of the usage. + """ return self._prompt_token_ids @property @@ -175,6 +180,11 @@ def output_token_ids(self, new_output_token_ids: List[int]) -> None: @property def output_token_ids_array(self) -> array: + """Return the prompt token ids in array type. + + Note that the array is in "I" type, and it is not compatible + with torch.long (2 bytes vs 4 bytes). So beware of the usage. 
+ """ assert isinstance(self._output_token_ids, array) return self._output_token_ids @@ -247,7 +257,7 @@ def get_prompt_token_ids(self) -> Tuple[int, ...]: def get_output_token_ids(self) -> Tuple[int, ...]: return self.output_token_ids - def get_delta(self) -> SequenceDataDelta: + def reset_and_get_delta(self) -> SequenceDataDelta: delta = SequenceDataDelta(self._new_appended_tokens, self._cumulative_logprob, self.get_num_computed_tokens(), self.stage) @@ -663,7 +673,11 @@ class SequenceGroupMetadataDelta(msgspec.Struct, tag=True, array_like=True, omit_defaults=True): - """Delta sequence group metadata.""" + """Delta of SequenceGroupMetadata. + + After sending the first SequenceGroupMetadata, vLLM scheduler + only sends delta to reduce the data payload size. + """ seq_data_delta: Dict[int, SequenceDataDelta] request_id: str block_tables: Dict[int, List[int]] diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index b66d8fec9fb..f0c26d36633 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -294,13 +294,13 @@ def _create_single_target_seq_group_metadata( input sequence. """ seq_data = seq_group_metadata.seq_data[seq_id] - prompt_token_ids = seq_data.get_prompt_token_ids() + prompt_token_ids = seq_data.prompt_token_ids_array new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] new_seq_data_dict = { target_seq_id: SequenceData( - array("I", prompt_token_ids), + prompt_token_ids, _output_token_ids=array("I", new_output_token_ids), ), } diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 3f49aeb8817..07d2161941c 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,7 +1,7 @@ """A GPU worker class.""" import gc import os -from typing import Dict, List, Optional, Set, Tuple, Type +from typing import Dict, List, Optional, Set, Tuple, Type, Union import torch import torch.distributed @@ -33,8 +33,6 @@ class Worker(LocalOrDistributedWorkerBase): Each worker is associated with a single GPU. The worker is responsible for maintaining the KV cache and executing the model on the GPU. In case of distributed inference, each worker is assigned a partition of the model. - - The worker manages the state of SequenceGroupMetadata. """ def __init__( @@ -297,26 +295,45 @@ def execute_worker(self, worker_input: WorkerInput) -> None: and worker_input.blocks_to_copy.numel() > 0): self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) - def _get_cached_seq_group_metadata(self, seq_group_metadata_list): - """In-place update execute_model_req based on a cached """ + def _get_cached_seq_group_metadata( + self, + seq_group_metadata_list: List[Union[SequenceGroupMetadata, + SequenceGroupMetadataDelta]], + finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: + """Return a list of cached Sequence Group Metadata after updating its + state. + + It is used because scheduler only sends delta to workers to reduce + the data payload size. The function also cleans up cache based on + a given `finished_request_ids`. + """ new_seq_group_metadata_list = [] for metadata_or_delta in seq_group_metadata_list: request_id = metadata_or_delta.request_id if request_id not in self._seq_group_metadata_cache: + # The first prefill. assert isinstance(metadata_or_delta, SequenceGroupMetadata) self._seq_group_metadata_cache[request_id] = metadata_or_delta else: + # The first prefill is already cached. 
if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): self._seq_group_metadata_cache[request_id].apply_delta( metadata_or_delta) else: # If metadata snapshot is sent again, it is either - # preempted, or chunked prefill. Reset the cache. + # preempted. Reset the cache because we need to start + # from scratch. assert isinstance(metadata_or_delta, SequenceGroupMetadata) self._seq_group_metadata_cache[ request_id] = metadata_or_delta + new_seq_group_metadata_list.append( self._seq_group_metadata_cache[request_id]) + + # Clean up finished ids + for finished_id in finished_request_ids: + del self._seq_group_metadata_cache[finished_id] + return new_seq_group_metadata_list def _execute_model_spmd( @@ -326,7 +343,9 @@ def _execute_model_spmd( ) -> Optional[List[SamplerOutput]]: if execute_model_req is not None: new_seq_group_metadata_list = self._get_cached_seq_group_metadata( - execute_model_req.seq_group_metadata_list) + execute_model_req.seq_group_metadata_list, + execute_model_req.finished_requests_ids) + execute_model_req.seq_group_metadata_list = ( new_seq_group_metadata_list) output = super()._execute_model_spmd(execute_model_req, From 751bdb1cf2740ca36c9183074141851814bef4f9 Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 11 Aug 2024 21:03:22 -0700 Subject: [PATCH 20/36] addressed code review. --- tests/distributed/test_pipeline_parallel.py | 5 ++--- vllm/config.py | 6 +++--- vllm/core/scheduler.py | 2 +- vllm/engine/arg_utils.py | 4 ++-- vllm/executor/msgspec_utils.py | 1 + vllm/model_executor/models/fuyu.py | 3 ++- vllm/worker/worker.py | 2 +- 7 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 64847b16e26..9135419593e 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -16,8 +16,7 @@ @pytest.mark.parametrize( ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, " - "MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL"), - [ + "MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL"), [ (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), @@ -33,7 +32,7 @@ (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - ]) + ]) def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL): if VLLM_MULTI_NODE and DIST_BACKEND == "mp": diff --git a/vllm/config.py b/vllm/config.py index 5c05f5d926b..eb78d2ac148 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -788,7 +788,7 @@ class SchedulerConfig: swapping. However, when the sequence group has multiple sequences (e.g., beam search), recomputation is not currently supported. In such a case, we use swapping instead. - _use_delta: Private API. If used, scheduler sends delta data to + _send_delta_data: Private API. If used, scheduler sends delta data to workers instead of an entire data. It should be enabled only when SPMD worker architecture is enabled. 
I.e., VLLM_USE_RAY_SPMD_WORKER=1 @@ -805,7 +805,7 @@ def __init__(self, enable_chunked_prefill: bool = False, embedding_mode: Optional[bool] = False, preemption_mode: Optional[str] = None, - _use_delta: bool = False) -> None: + _send_delta_data: bool = False) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: @@ -834,7 +834,7 @@ def __init__(self, self.chunked_prefill_enabled = enable_chunked_prefill self.embedding_mode = embedding_mode self.preemption_mode = preemption_mode - self._use_delta = _use_delta + self._send_delta_data = _send_delta_data self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 71e9e0bd491..b8b5f8ee950 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -996,7 +996,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. - if is_first_prefill or not self.scheduler_config._use_delta: + if is_first_prefill or not self.scheduler_config._send_delta_data: seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, is_prompt=is_prompt, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d7882c0dc67..ef2a33d35b0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -816,8 +816,8 @@ def create_engine_config(self, ) -> EngineConfig: enable_chunked_prefill=self.enable_chunked_prefill, embedding_mode=model_config.embedding_mode, preemption_mode=self.preemption_mode, - _use_delta=(envs.VLLM_USE_RAY_SPMD_WORKER - and parallel_config.use_ray)) + _send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER + and parallel_config.use_ray)) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index c47ec59555d..c403071698a 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -12,6 +12,7 @@ def encode_hook(obj: Any) -> Any: else: raise ValueError(f"Unsupported serialization type: {type(obj)}") + def decode_hook(type: Type, obj: Any) -> Any: """Custom msgspec dec hook that supports array types. diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 6ac61c494f6..f9ab1c0a9d9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -99,7 +99,8 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int): ncol, nrow = get_max_fuyu_image_feature_size() image_feature_size = get_max_fuyu_image_tokens(ctx) - token_ids = (array("I", [_IMAGE_TOKEN_ID]) * ncol + array("I", [_NEWLINE_TOKEN_ID])) * nrow + token_ids = (array("I", [_IMAGE_TOKEN_ID]) * ncol + + array("I", [_NEWLINE_TOKEN_ID])) * nrow token_ids += array("I", [0]) * (seq_len - image_feature_size) return SequenceData(token_ids) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 07d2161941c..90e24f86780 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -298,7 +298,7 @@ def execute_worker(self, worker_input: WorkerInput) -> None: def _get_cached_seq_group_metadata( self, seq_group_metadata_list: List[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]], + SequenceGroupMetadataDelta]], finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: """Return a list of cached Sequence Group Metadata after updating its state. 
From d91aa788ca5155e1b7c0819d82ab3e69742a6c22 Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 11 Aug 2024 21:04:39 -0700 Subject: [PATCH 21/36] lint --- vllm/executor/msgspec_utils.py | 2 +- vllm/executor/ray_gpu_executor.py | 3 +-- vllm/executor/ray_utils.py | 7 +++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index c403071698a..cb634c6f74e 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,5 +1,5 @@ -from typing import Any, Type from array import array +from typing import Any, Type def encode_hook(obj: Any) -> Any: diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 0d8df875174..306cd70f060 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -1,6 +1,5 @@ import asyncio import os -from array import array from collections import defaultdict from itertools import islice, repeat from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -10,8 +9,8 @@ import vllm.envs as envs from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) -from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.executor.msgspec_utils import encode_hook +from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (_run_task_with_lock, get_distributed_init_method, diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 6670d1c1e32..39eb8b0f74a 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,11 +1,10 @@ -from array import array -from typing import Any, List, Optional, Tuple, Type, Union +from typing import List, Optional, Tuple, Union import msgspec from vllm.config import ParallelConfig +from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.logger import init_logger -from vllm.executor.msgspec_utils import encode_hook, decode_hook from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import get_ip, is_hip, is_tpu, is_xpu from vllm.worker.worker_base import WorkerWrapperBase @@ -72,7 +71,7 @@ def execute_model_spmd( intermediate_tensors) # Pipeline model request and output to the next pipeline stage. 
if isinstance(output, IntermediateTensors): - output = serialized_data, output + output = serialized_req, output else: output = self.output_encoder.encode(output) From 1af8dc2908157c4f78f33ff64458dbc2ebf5d0ba Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 11 Aug 2024 22:12:02 -0700 Subject: [PATCH 22/36] ip --- .buildkite/test-pipeline.yaml | 1 - tests/core/test_serialization.py | 19 +++++++++++++++ .../test_basic_distributed_correctness.py | 22 ++++++++++-------- tests/distributed/test_pipeline_parallel.py | 22 ------------------ tests/prompts/example.txt | 2 +- tests/samplers/test_sampler.py | 11 +++++---- tests/spec_decode/utils.py | 5 ++-- tests/test_logits_processor.py | 3 ++- tests/test_sequence.py | 3 ++- .../test_encoder_decoder_model_runner.py | 9 ++++---- tests/worker/test_model_runner.py | 9 ++++---- vllm/core/scheduler.py | 23 ++++++++++++++++++- vllm/executor/msgspec_utils.py | 9 ++++++++ vllm/multimodal/base.py | 6 ++--- vllm/multimodal/image.py | 6 ++--- vllm/multimodal/registry.py | 8 +++---- vllm/sampling_params.py | 1 - vllm/sequence.py | 9 ++++++-- 18 files changed, 103 insertions(+), 65 deletions(-) create mode 100644 tests/core/test_serialization.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index eb91b3936ed..6e83c887f89 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -294,7 +294,6 @@ steps: - pytest -v -s distributed/test_chunked_prefill_distributed.py - pytest -v -s distributed/test_multimodal_broadcast.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py new file mode 100644 index 00000000000..9afe9f515d2 --- /dev/null +++ b/tests/core/test_serialization.py @@ -0,0 +1,19 @@ +import msgspec +from vllm.executor.msgspec_utils import encode_hook, decode_hook +from vllm.sequence import ExecuteModelRequest +from ..spec_decode.utils import create_batch + +def test_msgspec_serialization(): + num_lookahead_slots = 4 + seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots) + execute_model_req = ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list, + num_lookahead_slots=num_lookahead_slots, + running_queue_size=4) + + encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) + decoder = msgspec.msgpack.Decoder( + ExecuteModelRequest, dec_hook=decode_hook) + req = decoder.decode(encoder.encode(execute_model_req)) + assert (len(req.seq_group_metadata_list) + == len(execute_model_req.seq_group_metadata_list)) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 1de2ebab22d..22e74cacc75 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -22,15 +22,16 @@ @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize( - "model, distributed_executor_backend, attention_backend, test_suite", [ - ("facebook/opt-125m", "ray", "", "L4"), - ("facebook/opt-125m", "mp", "", "L4"), - ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), - ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), - ("facebook/opt-125m", 
"ray", "", "A100"), - ("facebook/opt-125m", "mp", "", "A100"), - ("facebook/opt-125m", "mp", "FLASHINFER", "A100"), - ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), + "model, distributed_executor_backend, attention_backend, test_suite, enable_adag", [ + # ("facebook/opt-125m", "ray", "", "L4", False), + ("facebook/opt-125m", "ray", "", "L4", True), + # ("facebook/opt-125m", "mp", "", "L4", False), + # ("meta-llama/Llama-2-7b-hf", "ray", "", "L4", False), + # ("meta-llama/Llama-2-7b-hf", "mp", "", "L4", False), + # ("facebook/opt-125m", "ray", "", "A100", False), + # ("facebook/opt-125m", "mp", "", "A100", False), + # ("facebook/opt-125m", "mp", "FLASHINFER", "A100", False), + # ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", False), ]) @fork_new_process_for_each_test def test_models( @@ -41,12 +42,13 @@ def test_models( distributed_executor_backend: str, attention_backend: str, test_suite: str, + enable_adag: bool, ) -> None: if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") - if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa + if enable_adag: # test ray adag os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 85a74eb8f73..8eb5ca9461c 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -14,27 +14,6 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" -<<<<<<< HEAD -@pytest.mark.parametrize( - ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, " - "MODEL_NAME, DIST_BACKEND, USE_RAY_ADAG, USE_RAY_ADAG_NCCL"), [ - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", False, False), - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (1, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray", True, True), - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp", False, False), - ]) -======= @pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, " "MODEL_NAME, DIST_BACKEND"), [ @@ -49,7 +28,6 @@ (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"), (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), ]) ->>>>>>> main def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND): if VLLM_MULTI_NODE and DIST_BACKEND == "mp": diff --git a/tests/prompts/example.txt b/tests/prompts/example.txt index cef4d1d7687..e1b97bc6eee 100644 --- a/tests/prompts/example.txt +++ b/tests/prompts/example.txt @@ -5,4 +5,4 @@ Describe the basic components of a neural network and how it can be trained. Write a short story about a robot that dreams for the first time. 
Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. -Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' \ No newline at end of file +Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index f1370e41124..565daf6ba78 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -2,6 +2,7 @@ import random from typing import Dict, List, Optional, Tuple from unittest.mock import Mock, patch +from array import array import pytest import torch @@ -56,7 +57,7 @@ def _do_sample( SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, + seq_data={0: SequenceData(array("I", [1, 2, 3]))}, sampling_params=sampling_params, block_tables={0: [1]}, )) @@ -201,7 +202,7 @@ def create_sampling_params(min_tokens, def create_sequence_data(num_input=3, num_generated=0): seq_data = SequenceData( - random.choices(range(0, VOCAB_SIZE), k=num_input)) + array("I", random.choices(range(0, VOCAB_SIZE), k=num_input))) if num_generated > 0: seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), k=num_generated) @@ -504,7 +505,7 @@ def test_sampler_mixed(seed: int, device: str): SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, + seq_data={0: SequenceData(array("I", [1, 2, 3]))}, sampling_params=sampling_params, block_tables={0: [1]}, )) @@ -600,7 +601,7 @@ def test_sampler_top_k_top_p(seed: int, device: str): SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, + seq_data={0: SequenceData(array("I", [1, 2, 3]))}, sampling_params=SamplingParams( temperature=1, top_k=top_k, @@ -650,7 +651,7 @@ def test_sampling_params(sampling_params: List[SamplingParams]): SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, + seq_data={0: SequenceData(array("I", [1, 2, 3]))}, sampling_params=sampling_params[i], block_tables={0: [1]}, )) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 86148291ae6..9e939996d0b 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -3,6 +3,7 @@ from typing import Sequence as GenericSequence from typing import TypeVar, Union from unittest.mock import MagicMock +from array import array import torch @@ -138,8 +139,8 @@ def create_seq_group_metadata_from_prompts( seq_data={ i: SequenceData( - prompt_token_ids=prompt_token_ids[:], - output_token_ids=cont_token_ids[:], + array("I", prompt_token_ids[:]), + _output_token_ids=array("I", cont_token_ids[:]), ), }, sampling_params=SamplingParams(temperature=0.0, ), diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index 8ee2d78190c..20ff59db7b8 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -1,6 +1,7 @@ import random from typing import Tuple from unittest.mock import patch +from array import array import pytest import torch @@ -69,7 +70,7 @@ def pick_ith(token_ids, logits): SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData([1, 2, 3])}, + seq_data={0: SequenceData(array("I", [1, 2, 3]))}, 
sampling_params=SamplingParams(temperature=0, logits_processors=[pick_ith]), block_tables={0: [1]}, diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 3136402518b..19e02c30798 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,5 +1,6 @@ import pytest +from array import array from vllm.sequence import (CompletionSequenceGroupOutput, SamplerOutput, SequenceData, SequenceOutput) @@ -54,7 +55,7 @@ def test_sampler_output_eq(sample_outputs): def test_sequence_data_prefill(): - seq_data = SequenceData(prompt_token_ids=[1, 2, 3, 4]) + seq_data = SequenceData(array("I", [1, 2, 3, 4])) assert seq_data.get_num_uncomputed_tokens() == 4 assert seq_data.get_num_computed_tokens() == 0 # advance by 2 diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 8a2e9b81580..4e2c6291dd5 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -2,6 +2,7 @@ import pytest import torch +from array import array from vllm.engine.arg_utils import EngineArgs from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata @@ -125,10 +126,10 @@ def test_prepare_prompt( # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(list(range(seq_len))) + seq_data = SequenceData(array("I", range(seq_len)))) encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 encoder_seq_lens.append(encoder_seq_len) - encoder_seq_data = SequenceData(list(range(encoder_seq_len))) + encoder_seq_data = SequenceData(array("I", range(encoder_seq_len)))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -319,10 +320,10 @@ def test_prepare_decode( # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(list(range(seq_len))) + seq_data = SequenceData(array("I", (range(seq_len)))) encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 encoder_seq_lens.append(encoder_seq_len) - encoder_seq_data = SequenceData(list(range(encoder_seq_len))) + encoder_seq_data = SequenceData(array("I", (range(encoder_seq_len)))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=False, diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 4d2edc02139..8b12df263a1 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -2,6 +2,7 @@ import pytest import torch +from array import array from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, init_distributed_environment) @@ -46,7 +47,7 @@ def test_prepare_prompt(batch_size): # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(list(range(seq_len))) + seq_data = SequenceData(array("I", range(seq_len)))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -163,7 +164,7 @@ def test_prepare_decode_cuda_graph(batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 context_lens.append(context_len) - seq_data = SequenceData(list(range(context_len))) + seq_data = SequenceData(array("I", range(context_len))) seq_data.update_num_computed_tokens(context_len) # Append one token ID since prefill is finished. 
seq_data.append_token_id(1, 0) @@ -324,7 +325,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(list(range(seq_len))) + seq_data = SequenceData(array("I", range(seq_len)))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -340,7 +341,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): for i in range(prefill_batch_size, batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 - prompt_toks = list(range(context_len)) + prompt_toks = array("I", range(context_len))) seq_data = SequenceData(prompt_toks) seq_data.append_token_id(1, 0) seq_data.update_num_computed_tokens(context_len) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 85d56e039f6..d9270119139 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1101,7 +1101,28 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. if is_first_prefill or not self.scheduler_config._send_delta_data: - seq_group_metadata.__init__( + # seq_group_metadata.__init__( + # request_id=seq_group.request_id, + # is_prompt=is_prompt, + # seq_data=seq_data, + # sampling_params=seq_group.sampling_params, + # block_tables=block_tables, + # do_sample=do_sample, + # pooling_params=seq_group.pooling_params, + # token_chunk_size=token_chunk_size, + # lora_request=seq_group.lora_request, + # computed_block_nums=common_computed_block_nums, + # encoder_seq_data=encoder_seq_data, + # cross_block_table=cross_block_table, + # # `multi_modal_data` will only be present for the 1st comm + # # between engine and worker. + # # the subsequent comms can still use delta, but + # # `multi_modal_data` will be None. + # multi_modal_data=seq_group.multi_modal_data + # if scheduler_outputs.num_prefill_groups > 0 else None, + # prompt_adapter_request=seq_group.prompt_adapter_request, + # ) + seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, is_prompt=is_prompt, seq_data=seq_data, diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index cb634c6f74e..c4a6844303a 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,5 +1,6 @@ from array import array from typing import Any, Type +from vllm.sequence import SequenceData def encode_hook(obj: Any) -> Any: @@ -9,6 +10,11 @@ def encode_hook(obj: Any) -> Any: """ if isinstance(obj, array): return obj.tobytes() + if isinstance(obj, SequenceData): + # This can be reconstructed from __post_init__. 
+ obj._prompt_token_ids_tuple = tuple() + obj._cached_all_token_ids = [] + obj._new_appended_tokens = [] else: raise ValueError(f"Unsupported serialization type: {type(obj)}") @@ -22,5 +28,8 @@ def decode_hook(type: Type, obj: Any) -> Any: deserialized = array('I') deserialized.frombytes(obj) return deserialized + if isinstance(obj, SequenceData): + obj.__post_init__() + return obj else: raise ValueError(f"Unsupported deserialization type: {type(obj)}") diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 7a4453e21fe..cd4113e13b2 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -11,7 +11,7 @@ from torch import nn from typing_extensions import TypeAlias -# from vllm.config import ModelConfig +from vllm.config import ModelConfig from vllm.inputs import InputContext from vllm.logger import init_logger from vllm.utils import JSONTree, json_map_leaves @@ -220,7 +220,7 @@ def wrapper(model_cls: N) -> N: return wrapper - def map_input(self, model_config: Any, data: object) -> MultiModalInputs: + def map_input(self, model_config: ModelConfig, data: object) -> MultiModalInputs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. @@ -290,7 +290,7 @@ def wrapper(model_cls: N) -> N: return wrapper - def get_max_multimodal_tokens(self, model_config: Any) -> int: + def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 34dc6a91dc8..db50229bda3 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,11 +1,11 @@ from functools import lru_cache -from typing import Any, List, Optional, Tuple, TypeVar +from typing import List, Optional, Tuple, TypeVar import torch from PIL import Image from transformers import PreTrainedTokenizerBase -# from vllm.config import ModelConfig +from vllm.config import ModelConfig from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.image_processor import get_image_processor @@ -106,7 +106,7 @@ class ImagePlugin(MultiModalPlugin): def get_data_key(self) -> str: return "image" - def _get_hf_image_processor(self, model_config: Any): + def _get_hf_image_processor(self, model_config: ModelConfig): return cached_get_image_processor( model_config.model, trust_remote_code=model_config.trust_remote_code) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 798ba2982dc..4fa9f2954e6 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -3,7 +3,7 @@ import torch -# from vllm.config import ModelConfig +from vllm.config import ModelConfig from vllm.logger import init_logger from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs, @@ -75,7 +75,7 @@ def register_image_input_mapper( """ return self.register_input_mapper("image", mapper) - def map_input(self, model_config: Any, + def map_input(self, model_config: ModelConfig, data: MultiModalDataDict) -> MultiModalInputs: """ Apply an input mapper to the data passed to the model. @@ -102,7 +102,7 @@ def map_input(self, model_config: Any, return MultiModalInputs(merged_dict) - def create_input_mapper(self, model_config: Any): + def create_input_mapper(self, model_config: ModelConfig): """ Create an input mapper (see :meth:`map_input`) for a specific model. 
""" @@ -130,7 +130,7 @@ def register_max_image_tokens( """ return self.register_max_multimodal_tokens("image", max_mm_tokens) - def get_max_multimodal_tokens(self, model_config: Any) -> int: + def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 2e156b0c021..5a1780da299 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -138,7 +138,6 @@ class SamplingParams(msgspec.Struct, omit_defaults=True): spaces_between_special_tokens: bool = True # Optional[List[LogitsProcessor]] type. We use Any here because # Optional[List[LogitsProcessor]] type is not supported by msgspec. - # We will also remove this API soon. logits_processors: Optional[Any] = None include_stop_str_in_output: bool = False truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None diff --git a/vllm/sequence.py b/vllm/sequence.py index 7fea1e215c5..1c6f0cf4878 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -13,7 +13,6 @@ from vllm.inputs.parse import is_valid_encoder_decoder_llm_inputs from vllm.lora.request import LoRARequest -from vllm.multimodal.base import MultiModalDataDict from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -21,6 +20,7 @@ if TYPE_CHECKING: from vllm.inputs import LLMInputs + from vllm.multimodal.base import MultiModalDataDict class Logprob(msgspec.Struct, omit_defaults=True, array_like=True): @@ -112,6 +112,8 @@ class RequestMetrics: class SequenceDataDelta(msgspec.Struct, array_like=True, omit_defaults=True): + """Delta SequenceGroupData to send per new decode request. + """ new_output_token_ids: List[int] new_cumulative_logprob: float new_num_computed_tokens: int @@ -131,6 +133,8 @@ class SequenceData(msgspec.Struct, omit_defaults=True): output_token_ids: The token IDs of the output. cumulative_logprob: The cumulative log probability of the output. """ + # NOTE: we cannot use Union[List, array] because msgspec cannot support + # union of 2 list types. _prompt_token_ids: array _output_token_ids: array = msgspec.field( default_factory=lambda: array("I", [])) @@ -144,7 +148,8 @@ class SequenceData(msgspec.Struct, omit_defaults=True): _stage: SequenceStage = SequenceStage.PREFILL _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) - # Below fields are used to get delta input. + # It is used to get delta input. It is reset when `reset_and_get_delta` + # is called. 
_new_appended_tokens: List[int] = msgspec.field(default_factory=list) def __post_init__(self, ) -> None: From 6e6ac92719cc28a9fb8024134738c28c4245ad52 Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 11 Aug 2024 22:28:05 -0700 Subject: [PATCH 23/36] all working --- a.py | 4 +++ tests/core/test_serialization.py | 11 +++--- .../test_basic_distributed_correctness.py | 19 ++++++----- .../test_chunked_prefill_distributed.py | 2 +- vllm/core/scheduler.py | 34 ++----------------- vllm/executor/ray_gpu_executor.py | 2 ++ vllm/multimodal/base.py | 3 +- vllm/prompt_adapter/request.py | 6 ++-- vllm/sequence.py | 7 ++-- 9 files changed, 34 insertions(+), 54 deletions(-) create mode 100644 a.py diff --git a/a.py b/a.py new file mode 100644 index 00000000000..846b9239a45 --- /dev/null +++ b/a.py @@ -0,0 +1,4 @@ +from vllm.prompt_adapter.request import PromptAdapterRequest + +r = PromptAdapterRequest("a", 1, "a", 1) +r.__hash__() \ No newline at end of file diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index 9afe9f515d2..cfa67b0f7e7 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -3,6 +3,7 @@ from vllm.sequence import ExecuteModelRequest from ..spec_decode.utils import create_batch + def test_msgspec_serialization(): num_lookahead_slots = 4 seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots) @@ -10,10 +11,10 @@ def test_msgspec_serialization(): seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=num_lookahead_slots, running_queue_size=4) - + encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) - decoder = msgspec.msgpack.Decoder( - ExecuteModelRequest, dec_hook=decode_hook) + decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, + dec_hook=decode_hook) req = decoder.decode(encoder.encode(execute_model_req)) - assert (len(req.seq_group_metadata_list) - == len(execute_model_req.seq_group_metadata_list)) + assert (len(req.seq_group_metadata_list) == len( + execute_model_req.seq_group_metadata_list)) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 22e74cacc75..2d68ea623f4 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -22,16 +22,17 @@ @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize( - "model, distributed_executor_backend, attention_backend, test_suite, enable_adag", [ - # ("facebook/opt-125m", "ray", "", "L4", False), + "model, distributed_executor_backend, attention_backend, test_suite, enable_adag", + [ + ("facebook/opt-125m", "ray", "", "L4", False), ("facebook/opt-125m", "ray", "", "L4", True), - # ("facebook/opt-125m", "mp", "", "L4", False), - # ("meta-llama/Llama-2-7b-hf", "ray", "", "L4", False), - # ("meta-llama/Llama-2-7b-hf", "mp", "", "L4", False), - # ("facebook/opt-125m", "ray", "", "A100", False), - # ("facebook/opt-125m", "mp", "", "A100", False), - # ("facebook/opt-125m", "mp", "FLASHINFER", "A100", False), - # ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", False), + ("facebook/opt-125m", "mp", "", "L4", False), + ("meta-llama/Llama-2-7b-hf", "ray", "", "L4", False), + ("meta-llama/Llama-2-7b-hf", "mp", "", "L4", False), + ("facebook/opt-125m", "ray", "", "A100", False), + ("facebook/opt-125m", "mp", "", "A100", False), + ("facebook/opt-125m", "mp", "FLASHINFER", "A100", False), + ("meta-llama/Meta-Llama-3-8B", "ray", 
"FLASHINFER", "A100", False), ]) @fork_new_process_for_each_test def test_models( diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 21d94f085a1..6976dbf8ec3 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -17,7 +17,7 @@ @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("model, distributed_executor_backend", [ +@pytest.mark.parametrize("model, distributed_executor_backend, enable_adag", [ ("facebook/opt-125m", "ray", False), ("facebook/opt-125m", "ray", True), ("meta-llama/Llama-2-7b-hf", "ray", False), diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index d9270119139..2f39f5e00fe 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -364,8 +364,6 @@ def __init__( self.num_cumulative_preemption: int = 0 # Used to cache python objects - self._seq_group_metadata_cache: PyObjectCache = PyObjectCache( - seq_group_metadata_builder) self._scheduler_running_outputs_cache: PyObjectCache = PyObjectCache( scheduler_running_outputs_builder) self._scheduled_seq_group_cache: PyObjectCache = PyObjectCache( @@ -1046,15 +1044,10 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: token_chunk_size = scheduled_seq_group.token_chunk_size seq_group.maybe_set_first_scheduled_time(now) - seq_group_metadata = self._seq_group_metadata_cache.get_object() - seq_group_metadata.seq_data.clear() - seq_group_metadata.block_tables.clear() - # seq_id -> SequenceData - seq_data: Dict[int, SequenceData] = seq_group_metadata.seq_data + seq_data: Dict[int, SequenceData] = {} # seq_id -> physical block numbers - block_tables: Dict[int, - List[int]] = seq_group_metadata.block_tables + block_tables: Dict[int, List[int]] = {} if seq_group.is_encoder_decoder(): # Encoder associated with SequenceGroup @@ -1101,27 +1094,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. if is_first_prefill or not self.scheduler_config._send_delta_data: - # seq_group_metadata.__init__( - # request_id=seq_group.request_id, - # is_prompt=is_prompt, - # seq_data=seq_data, - # sampling_params=seq_group.sampling_params, - # block_tables=block_tables, - # do_sample=do_sample, - # pooling_params=seq_group.pooling_params, - # token_chunk_size=token_chunk_size, - # lora_request=seq_group.lora_request, - # computed_block_nums=common_computed_block_nums, - # encoder_seq_data=encoder_seq_data, - # cross_block_table=cross_block_table, - # # `multi_modal_data` will only be present for the 1st comm - # # between engine and worker. - # # the subsequent comms can still use delta, but - # # `multi_modal_data` will be None. - # multi_modal_data=seq_group.multi_modal_data - # if scheduler_outputs.num_prefill_groups > 0 else None, - # prompt_adapter_request=seq_group.prompt_adapter_request, - # ) seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, is_prompt=is_prompt, @@ -1168,8 +1140,6 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: self.block_manager.mark_blocks_as_computed( scheduled_seq_group.seq_group) - self._seq_group_metadata_cache.reset() - scheduler_time = time.perf_counter() - scheduler_start_time # Add this to scheduler time to all the sequences that are currently # running. 
This will help estimate if the scheduler is a significant diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 306cd70f060..e5fc52cd661 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -121,6 +121,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", ray_remote_kwargs = self._configure_ray_workers_use_nsight( ray_remote_kwargs) + logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) + # Create the workers. driver_ip = get_ip() worker_wrapper_kwargs = self._get_worker_wrapper_args() diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index cd4113e13b2..aefb5f438c5 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -220,7 +220,8 @@ def wrapper(model_cls: N) -> N: return wrapper - def map_input(self, model_config: ModelConfig, data: object) -> MultiModalInputs: + def map_input(self, model_config: ModelConfig, + data: object) -> MultiModalInputs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index 499dd5114d5..26ab043bbea 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -3,8 +3,10 @@ from vllm.adapter_commons.request import AdapterRequest -class PromptAdapterRequest(msgspec.Struct, array_like=True, - omit_defaults=True): +class PromptAdapterRequest(msgspec.Struct, + array_like=True, + omit_defaults=True, + frozen=True): """ Request for a Prompt adapter. """ diff --git a/vllm/sequence.py b/vllm/sequence.py index 1c6f0cf4878..5f020a92665 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,8 +5,8 @@ from array import array from collections import defaultdict from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, Tuple, - Union, cast) +from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, + Tuple, Union, cast) import msgspec import torch @@ -112,8 +112,7 @@ class RequestMetrics: class SequenceDataDelta(msgspec.Struct, array_like=True, omit_defaults=True): - """Delta SequenceGroupData to send per new decode request. 
- """ + """Delta SequenceGroupData to send to workers.""" new_output_token_ids: List[int] new_cumulative_logprob: float new_num_computed_tokens: int From fa0d077b966c43fa10fa086ea04e9d59af9d07bd Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 11 Aug 2024 22:30:46 -0700 Subject: [PATCH 24/36] lint --- tests/distributed/test_basic_distributed_correctness.py | 3 ++- tests/worker/test_encoder_decoder_model_runner.py | 4 ++-- tests/worker/test_model_runner.py | 6 +++--- vllm/model_executor/models/paligemma.py | 2 +- vllm/multimodal/registry.py | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 2d68ea623f4..9761c4ce147 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -22,7 +22,8 @@ @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize( - "model, distributed_executor_backend, attention_backend, test_suite, enable_adag", + "model, distributed_executor_backend, attention_backend, " + "test_suite, enable_adag", [ ("facebook/opt-125m", "ray", "", "L4", False), ("facebook/opt-125m", "ray", "", "L4", True), diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 4e2c6291dd5..465cd4359ff 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -126,10 +126,10 @@ def test_prepare_prompt( # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(array("I", range(seq_len)))) + seq_data = SequenceData(array("I", range(seq_len))) encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 encoder_seq_lens.append(encoder_seq_len) - encoder_seq_data = SequenceData(array("I", range(encoder_seq_len)))) + encoder_seq_data = SequenceData(array("I", range(encoder_seq_len))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 8b12df263a1..62053874c64 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -47,7 +47,7 @@ def test_prepare_prompt(batch_size): # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(array("I", range(seq_len)))) + seq_data = SequenceData(array("I", range(seq_len))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -325,7 +325,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(array("I", range(seq_len)))) + seq_data = SequenceData(array("I", range(seq_len))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -341,7 +341,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): for i in range(prefill_batch_size, batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 - prompt_toks = array("I", range(context_len))) + prompt_toks = array("I", range(context_len)) seq_data = SequenceData(prompt_toks) seq_data.append_token_id(1, 0) 
seq_data.update_num_computed_tokens(context_len) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 0e3c7cd846b..85a3fc3f015 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -36,7 +36,7 @@ def get_max_paligemma_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config(PaliGemmaConfig) vision_config = hf_config.vision_config - return text_config.num_image_tokens + return get_max_siglip_image_tokens(vision_config) def dummy_seq_data_for_paligemma( diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 4fa9f2954e6..d8e1b68178a 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,5 +1,5 @@ import functools -from typing import Any, Dict, Optional, Sequence +from typing import Dict, Optional, Sequence import torch From b5a88ec3105a73771bfdf05d1c139b6820f38515 Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 11 Aug 2024 22:32:22 -0700 Subject: [PATCH 25/36] done --- tests/core/test_serialization.py | 4 ++- .../test_basic_distributed_correctness.py | 3 +- .../test_chunked_prefill_distributed.py | 1 + tests/samplers/test_sampler.py | 2 +- tests/spec_decode/utils.py | 2 +- tests/test_logits_processor.py | 2 +- tests/test_sequence.py | 3 +- .../test_encoder_decoder_model_runner.py | 2 +- tests/worker/test_model_runner.py | 2 +- vllm/executor/msgspec_utils.py | 1 + vllm/model_executor/models/paligemma.py | 34 ------------------- vllm/model_executor/models/siglip.py | 3 +- 12 files changed, 15 insertions(+), 44 deletions(-) diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index cfa67b0f7e7..1a07fd4fee9 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -1,6 +1,8 @@ import msgspec -from vllm.executor.msgspec_utils import encode_hook, decode_hook + +from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.sequence import ExecuteModelRequest + from ..spec_decode.utils import create_batch diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 9761c4ce147..f6bf3bb60d6 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -23,8 +23,7 @@ reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize( "model, distributed_executor_backend, attention_backend, " - "test_suite, enable_adag", - [ + "test_suite, enable_adag", [ ("facebook/opt-125m", "ray", "", "L4", False), ("facebook/opt-125m", "ray", "", "L4", True), ("facebook/opt-125m", "mp", "", "L4", False), diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 6976dbf8ec3..516296f876f 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -7,6 +7,7 @@ """ import os + import pytest from vllm.utils import cuda_device_count_stateless diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 565daf6ba78..b08a2bcbd23 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,8 +1,8 @@ import itertools import random +from array import array from typing import Dict, List, Optional, Tuple from unittest.mock import Mock, patch -from array import array import pytest import torch diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 
9e939996d0b..f598dd2165a 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,9 +1,9 @@ +from array import array from itertools import count from typing import Callable, Dict, List, Optional from typing import Sequence as GenericSequence from typing import TypeVar, Union from unittest.mock import MagicMock -from array import array import torch diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index 20ff59db7b8..bdaf5a8d159 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -1,7 +1,7 @@ import random +from array import array from typing import Tuple from unittest.mock import patch -from array import array import pytest import torch diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 19e02c30798..b780f5673c1 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,6 +1,7 @@ +from array import array + import pytest -from array import array from vllm.sequence import (CompletionSequenceGroupOutput, SamplerOutput, SequenceData, SequenceOutput) diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 465cd4359ff..05d7161a0e9 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -1,8 +1,8 @@ +from array import array from typing import List import pytest import torch -from array import array from vllm.engine.arg_utils import EngineArgs from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 62053874c64..23f8febb0ad 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,8 +1,8 @@ +from array import array from typing import List import pytest import torch -from array import array from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, init_distributed_environment) diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index c4a6844303a..05221b35f9c 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,5 +1,6 @@ from array import array from typing import Any, Type + from vllm.sequence import SequenceData diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 85a3fc3f015..9ba53b8b59a 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,4 +1,3 @@ -from array import array from typing import Iterable, List, Literal, Optional, Tuple, TypedDict import torch @@ -39,39 +38,6 @@ def get_max_paligemma_image_tokens(ctx: InputContext): return get_max_siglip_image_tokens(vision_config) -def dummy_seq_data_for_paligemma( - hf_config: PaliGemmaConfig, - seq_len: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = hf_config.text_config.num_image_tokens - else: - image_feature_size = image_feature_size_override - - token_ids = array("I", [image_token_id]) * image_feature_size - token_ids += array("I", [0]) * (seq_len - image_feature_size) - return SequenceData(token_ids) - - -def dummy_image_for_paligemma( - hf_config: SiglipVisionConfig, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = height = hf_config.image_size - if image_width_override is not None: - width = image_width_override - if 
image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image} - - def dummy_data_for_paligemma(ctx: InputContext, seq_len: int): hf_config = ctx.get_hf_config(PaliGemmaConfig) vision_config = hf_config.vision_config diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 5ba14f73394..6380014957d 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -2,6 +2,7 @@ within a vision language model.""" import math +from array import array from typing import Iterable, Optional, Tuple import torch @@ -63,7 +64,7 @@ def dummy_seq_data_for_siglip( token_ids = [image_token_id] * image_feature_size token_ids += [0] * (seq_len - image_feature_size) - return SequenceData(token_ids) + return SequenceData(array("I", token_ids)) def dummy_image_for_siglip( From d2e14ca3de4117a9b71f8135fce90bf0f4dcd3b2 Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 11 Aug 2024 22:47:15 -0700 Subject: [PATCH 26/36] code review. --- vllm/core/scheduler.py | 2 +- vllm/sequence.py | 4 ++-- vllm/worker/worker.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 2f39f5e00fe..e042f016ff5 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1120,7 +1120,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # the first request to reduce serialization cost. seq_data_delta = {} for id, data in seq_data.items(): - seq_data_delta[id] = data.reset_and_get_delta() + seq_data_delta[id] = data.get_delta_and_reset() seq_group_metadata = SequenceGroupMetadataDelta( seq_data_delta, seq_group.request_id, diff --git a/vllm/sequence.py b/vllm/sequence.py index 5f020a92665..43ced6cc66f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -147,7 +147,7 @@ class SequenceData(msgspec.Struct, omit_defaults=True): _stage: SequenceStage = SequenceStage.PREFILL _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) - # It is used to get delta input. It is reset when `reset_and_get_delta` + # It is used to get delta input. It is reset when `get_delta_and_reset` # is called. _new_appended_tokens: List[int] = msgspec.field(default_factory=list) @@ -271,7 +271,7 @@ def get_prompt_token_ids(self) -> Tuple[int, ...]: def get_output_token_ids(self) -> Tuple[int, ...]: return self.output_token_ids - def reset_and_get_delta(self) -> SequenceDataDelta: + def get_delta_and_reset(self) -> SequenceDataDelta: delta = SequenceDataDelta(self._new_appended_tokens, self._cumulative_logprob, self.get_num_computed_tokens(), self.stage) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 4650b9585fa..7c7618f5638 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -332,7 +332,7 @@ def _get_cached_seq_group_metadata( self._seq_group_metadata_cache[request_id].apply_delta( metadata_or_delta) else: - # If metadata snapshot is sent again, it is either + # If metadata snapshot is sent again, it is # preempted. Reset the cache because we need to start # from scratch. assert isinstance(metadata_or_delta, SequenceGroupMetadata) From 8be3c8e7721db422f51a1830f241443833bf0beb Mon Sep 17 00:00:00 2001 From: sang Date: Mon, 12 Aug 2024 17:59:32 -0700 Subject: [PATCH 27/36] addressed code review. 
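A rough, illustrative sketch (not from the original commit) of how the delta path ends up gated by this change, assuming the VLLM_USE_RAY_SPMD_WORKER / VLLM_SPMD_SEND_DELTA_DATA handling added in envs.py and arg_utils.py below; `uses_ray_backend` is a hypothetical stand-in for parallel_config.use_ray:

    import os

    # Delta input still requires the Ray SPMD worker; it can be opted out of
    # via VLLM_SPMD_SEND_DELTA_DATA, which defaults to enabled.
    use_spmd_worker = bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0")))
    send_delta = bool(int(os.getenv("VLLM_SPMD_SEND_DELTA_DATA", "1")))
    uses_ray_backend = True  # stands in for parallel_config.use_ray

    send_delta_data = use_spmd_worker and send_delta and uses_ray_backend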
--- a.py | 4 ---- tests/core/test_serialization.py | 15 +++++++++++++-- vllm/engine/arg_utils.py | 1 + vllm/envs.py | 5 +++++ vllm/executor/msgspec_utils.py | 10 ---------- vllm/model_executor/layers/sampler.py | 3 +++ vllm/sampling_params.py | 3 +++ vllm/sequence.py | 10 +++++++--- 8 files changed, 32 insertions(+), 19 deletions(-) delete mode 100644 a.py diff --git a/a.py b/a.py deleted file mode 100644 index 846b9239a45..00000000000 --- a/a.py +++ /dev/null @@ -1,4 +0,0 @@ -from vllm.prompt_adapter.request import PromptAdapterRequest - -r = PromptAdapterRequest("a", 1, "a", 1) -r.__hash__() \ No newline at end of file diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index 1a07fd4fee9..d604e5250a3 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -18,5 +18,16 @@ def test_msgspec_serialization(): decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, dec_hook=decode_hook) req = decoder.decode(encoder.encode(execute_model_req)) - assert (len(req.seq_group_metadata_list) == len( - execute_model_req.seq_group_metadata_list)) + expected = execute_model_req.seq_group_metadata_list + actual = req.seq_group_metadata_list + assert (len(expected) == len(actual)) + expected = expected[0] + actual = actual[0] + + assert expected.block_tables == actual.block_tables + assert expected.is_prompt == actual.is_prompt + assert expected.request_id == actual.request_id + assert (expected.seq_data[0].prompt_token_ids == + actual.seq_data[0].prompt_token_ids) + assert (expected.seq_data[0].output_token_ids == + actual.seq_data[0].output_token_ids) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 54ab2032f83..1ed1a51ec1c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -834,6 +834,7 @@ def create_engine_config(self, ) -> EngineConfig: embedding_mode=model_config.embedding_mode, preemption_mode=self.preemption_mode, _send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER + and envs.VLLM_SPMD_SEND_DELTA_DATA and parallel_config.use_ray)) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, diff --git a/vllm/envs.py b/vllm/envs.py index 26d0c33707f..9bcee5b4339 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -39,6 +39,7 @@ VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 VLLM_USE_RAY_SPMD_WORKER: bool = False + VLLM_SPMD_SEND_DELTA_DATA: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True VLLM_WORKER_MULTIPROC_METHOD: str = "fork" @@ -290,6 +291,10 @@ def get_default_config_root(): "VLLM_USE_RAY_SPMD_WORKER": lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))), + # If set, it sends delta data for SPMD workers. + "VLLM_SPMD_SEND_DELTA_DATA": + lambda: bool(int(os.getenv("VLLM_SPMD_SEND_DELTA_DATA", "1"))), + # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index 05221b35f9c..cb634c6f74e 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,8 +1,6 @@ from array import array from typing import Any, Type -from vllm.sequence import SequenceData - def encode_hook(obj: Any) -> Any: """Custom msgspec enc hook that supports array types. 
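# Illustrative sketch only (not part of this patch): how these hooks let
# msgspec round-trip array("I") token buffers, in the spirit of
# tests/core/test_serialization.py. `TokenIds` is a made-up struct standing
# in for SequenceData-like types, not a real vLLM class.
from array import array

import msgspec

from vllm.executor.msgspec_utils import decode_hook, encode_hook


class TokenIds(msgspec.Struct, array_like=True, omit_defaults=True):
    prompt_token_ids: array


encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
decoder = msgspec.msgpack.Decoder(TokenIds, dec_hook=decode_hook)

# encode_hook turns the array into raw bytes; decode_hook rebuilds an
# array("I") from those bytes on the receiving worker.
restored = decoder.decode(encoder.encode(TokenIds(array("I", [1, 2, 3]))))
assert restored.prompt_token_ids == array("I", [1, 2, 3])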
@@ -11,11 +9,6 @@ def encode_hook(obj: Any) -> Any: """ if isinstance(obj, array): return obj.tobytes() - if isinstance(obj, SequenceData): - # This can be reconstructed from __post_init__. - obj._prompt_token_ids_tuple = tuple() - obj._cached_all_token_ids = [] - obj._new_appended_tokens = [] else: raise ValueError(f"Unsupported serialization type: {type(obj)}") @@ -29,8 +22,5 @@ def decode_hook(type: Type, obj: Any) -> Any: deserialized = array('I') deserialized.frombytes(obj) return deserialized - if isinstance(obj, SequenceData): - obj.__post_init__() - return obj else: raise ValueError(f"Unsupported deserialization type: {type(obj)}") diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cc78a0ea3b8..ebb5f9b13ae 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -794,6 +794,7 @@ def _get_logprobs( # Update indices and tokens for prompt logprobs. if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): + print(f"SANG-TODO {sampling_params.prompt_logprobs=}") largest_num_logprobs = max(largest_num_logprobs, sampling_params.prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) @@ -813,6 +814,7 @@ def _get_logprobs( next_token_ids.extend(token_ids) if sampling_params.logprobs is not None: + print(f"SANG-TODO {sampling_params.logprobs=}") largest_num_logprobs = max(largest_num_logprobs, sampling_params.logprobs) @@ -851,6 +853,7 @@ def _get_logprobs( if largest_num_logprobs > 0: # Logprobs of topk tokens for a batch of sequence groups. # (num_query_tokens_across_batch). + print(f"SANG-TODO {largest_num_logprobs=}") top_logprobs, top_token_ids = torch.topk(logprobs, largest_num_logprobs, dim=-1) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5a1780da299..52a98914903 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -163,6 +163,9 @@ def __post_init__(self) -> None: self.stop_token_ids = [] else: self.stop_token_ids = list(self.stop_token_ids) + self.logprobs = 1 if self.logprobs is True else self.logprobs + self.prompt_logprobs = (1 if self.prompt_logprobs is True else + self.prompt_logprobs) # Number of characters to hold back for stop string evaluation # until sequence is finished. diff --git a/vllm/sequence.py b/vllm/sequence.py index 43ced6cc66f..dd119dfe50b 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -112,10 +112,14 @@ class RequestMetrics: class SequenceDataDelta(msgspec.Struct, array_like=True, omit_defaults=True): - """Delta SequenceGroupData to send to workers.""" + """Delta SequenceData to send to workers per step.""" + # A new token to be appended to existing SequenceData. new_output_token_ids: List[int] + # Overwriting existing `cumulative_logprob` new_cumulative_logprob: float + # Overwriting existing `num_computed_tokens`. new_num_computed_tokens: int + # Overwriting existing `stage`. new_stage: SequenceStage @@ -151,7 +155,7 @@ class SequenceData(msgspec.Struct, omit_defaults=True): # is called. _new_appended_tokens: List[int] = msgspec.field(default_factory=list) - def __post_init__(self, ) -> None: + def __post_init__(self) -> None: self._prompt_token_ids_tuple: Tuple[int, ...] 
= tuple( self._prompt_token_ids) self._update_cached_all_tokens() @@ -892,7 +896,7 @@ def apply_delta(self, sequence_group_metadata_delta: SequenceGroupMetadataDelta): for id, delta in sequence_group_metadata_delta.seq_data_delta.items(): self.seq_data[id].apply_delta(delta) - self.request_id = sequence_group_metadata_delta.request_id + assert self.request_id == sequence_group_metadata_delta.request_id self.block_tables = sequence_group_metadata_delta.block_tables self.token_chunk_size = sequence_group_metadata_delta.token_chunk_size self.do_sample = sequence_group_metadata_delta.do_sample From 2ba99e2cb05456d176de8076c1682f5aa4893c1e Mon Sep 17 00:00:00 2001 From: sang Date: Tue, 13 Aug 2024 11:44:50 -0700 Subject: [PATCH 28/36] lint fix --- vllm/lora/request.py | 5 ++- vllm/pooling_params.py | 5 ++- vllm/prompt_adapter/request.py | 9 +++-- vllm/sampling_params.py | 3 +- vllm/sequence.py | 70 +++++++++++++++++++++++----------- vllm/spec_decode/metrics.py | 7 ++-- 6 files changed, 67 insertions(+), 32 deletions(-) diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 094e4b88f7c..d770da4f240 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -6,7 +6,10 @@ from vllm.adapter_commons.request import AdapterRequest -class LoRARequest(msgspec.Struct, omit_defaults=True, array_like=True): +class LoRARequest( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """ Request for a LoRA adapter. diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 204adac889b..7461fb51989 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -3,7 +3,10 @@ import msgspec -class PoolingParams(msgspec.Struct, omit_defaults=True, array_like=True): +class PoolingParams( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """Pooling parameters for pooling. Attributes: diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index 26ab043bbea..775dd11db07 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -3,10 +3,11 @@ from vllm.adapter_commons.request import AdapterRequest -class PromptAdapterRequest(msgspec.Struct, - array_like=True, - omit_defaults=True, - frozen=True): +class PromptAdapterRequest( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + frozen=True): # type: ignore[call-arg] """ Request for a Prompt adapter. """ diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 52a98914903..2d6e4489cf2 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -31,7 +31,8 @@ class SamplingType(IntEnum): to sample from.""" -class SamplingParams(msgspec.Struct, omit_defaults=True): +class SamplingParams(msgspec.Struct, + omit_defaults=True): # type: ignore[call-arg] """Sampling parameters for text generation. Overall, we follow the sampling parameters from the OpenAI text completion diff --git a/vllm/sequence.py b/vllm/sequence.py index dd119dfe50b..3517aedcaa6 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -23,7 +23,10 @@ from vllm.multimodal.base import MultiModalDataDict -class Logprob(msgspec.Struct, omit_defaults=True, array_like=True): +class Logprob( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """Infos for supporting OpenAI compatible logprobs and token ranks. 
Attributes: @@ -111,7 +114,10 @@ class RequestMetrics: model_execute_time: Optional[float] = None -class SequenceDataDelta(msgspec.Struct, array_like=True, omit_defaults=True): +class SequenceDataDelta( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] """Delta SequenceData to send to workers per step.""" # A new token to be appended to existing SequenceData. new_output_token_ids: List[int] @@ -123,7 +129,8 @@ class SequenceDataDelta(msgspec.Struct, array_like=True, omit_defaults=True): new_stage: SequenceStage -class SequenceData(msgspec.Struct, omit_defaults=True): +class SequenceData(msgspec.Struct, + omit_defaults=True): # type: ignore[call-arg] """Data associated with a sequence. Args: @@ -794,10 +801,11 @@ def __repr__(self) -> str: f"num_seqs={len(self.seqs)})") -class SequenceGroupMetadataDelta(msgspec.Struct, - tag=True, - array_like=True, - omit_defaults=True): +class SequenceGroupMetadataDelta( + msgspec.Struct, + tag=True, # type: ignore[call-arg] + array_like=True, # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] """Delta of SequenceGroupMetadata. After sending the first SequenceGroupMetadata, vLLM scheduler @@ -812,10 +820,11 @@ class SequenceGroupMetadataDelta(msgspec.Struct, computed_block_nums: Optional[List[int]] = None -class SequenceGroupMetadata(msgspec.Struct, - tag=True, - array_like=True, - omit_defaults=True): +class SequenceGroupMetadata( + msgspec.Struct, + tag=True, # type: ignore[call-arg] + array_like=True, # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] """Metadata for a sequence group. Used to create `AttentionMetadata`. Args: @@ -903,7 +912,10 @@ def apply_delta(self, self.is_prompt = sequence_group_metadata_delta.is_prompt -class SequenceOutput(msgspec.Struct, omit_defaults=True, array_like=True): +class SequenceOutput( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """The model output associated with a sequence. Args: @@ -943,9 +955,10 @@ def __eq__(self, other: object) -> bool: pass -class CompletionSequenceGroupOutput(msgspec.Struct, - omit_defaults=True, - array_like=True): +class CompletionSequenceGroupOutput( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] __metaclass__ = SequenceGroupOutput """The model output associated with a completion sequence group.""" samples: List[SequenceOutput] @@ -965,8 +978,8 @@ def __eq__(self, other: object) -> bool: class EmbeddingSequenceGroupOutput( msgspec.Struct, - omit_defaults=True, - array_like=True, + omit_defaults=True, # type: ignore[call-arg] + array_like=True, # type: ignore[call-arg] ): """The model output associated with an embedding sequence group.""" __metaclass__ = SequenceGroupOutput @@ -982,7 +995,10 @@ def __eq__(self, other: object) -> bool: return self.embeddings == other.embeddings -class IntermediateTensors(msgspec.Struct, omit_defaults=True, array_like=True): +class IntermediateTensors( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """For all pipeline stages except the last, we need to return the hidden states and residuals to be sent to the next stage. This data structure contains the hidden states and residuals for a request. 
@@ -1009,7 +1025,10 @@ def __repr__(self) -> str: return f"IntermediateTensors(tensors={self.tensors})" -class SamplerOutput(msgspec.Struct, omit_defaults=True, array_like=True): +class SamplerOutput( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """For each sequence group, we generate a list of SequenceOutput object, each of which contains one possible candidate for the next token. @@ -1068,7 +1087,10 @@ def __repr__(self) -> str: f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") -class PoolerOutput(msgspec.Struct, omit_defaults=True, array_like=True): +class PoolerOutput( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """The output from a pooling operation in the embedding model.""" outputs: List[EmbeddingSequenceGroupOutput] @@ -1111,7 +1133,8 @@ def get_all_seq_ids_and_request_ids( return seq_ids, request_id_seq_ids_mapping -class HiddenStates(msgspec.Struct, array_like=True, omit_defaults=True): +class HiddenStates(msgspec.Struct, array_like=True, + omit_defaults=True): # type: ignore[call-arg] """Hidden states corresponding to in-progress sequences. Used in speculative decoding to pass hidden states from the target model to the proposer model in the subsequent step. @@ -1149,7 +1172,10 @@ def prune(self, self._seq_ids = seq_ids -class ExecuteModelRequest(msgspec.Struct, array_like=True, omit_defaults=True): +class ExecuteModelRequest( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] """The model execution request, containing CPU metadata only. The LLM engine should create an instance of this class for each request batch.""" # The sequence group metadata list. diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index b959bb9fd09..ad4e2dc879d 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -9,9 +9,10 @@ from vllm.utils import is_pin_memory_available -class SpecDecodeWorkerMetrics(msgspec.Struct, - omit_defaults=True, - array_like=True): +class SpecDecodeWorkerMetrics( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """Dataclass holding metrics emitted from the spec decode worker. 
""" From 925c928a46edbbe792ffbd1bbea1462fe15fee44 Mon Sep 17 00:00:00 2001 From: sang Date: Wed, 14 Aug 2024 15:14:29 -0700 Subject: [PATCH 29/36] fix lint --- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/fuyu.py | 3 ++- vllm/sampling_params.py | 6 +++--- vllm/sequence.py | 13 ++++++++++--- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 3dc0223b27f..4cfa8802f75 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -71,7 +71,7 @@ def dummy_seq_data_for_chameleon( else: image_feature_size = image_feature_size_override - token_ids = array("I",[image_token_id]) * image_feature_size * num_images + token_ids = array("I", [image_token_id]) * image_feature_size * num_images token_ids += array("I", [0]) * (seq_len - image_feature_size * num_images) return SequenceData(token_ids) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 1b8e99d4aaf..71cd7511ccf 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -98,7 +98,8 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): ncol, nrow = get_max_fuyu_image_feature_size() image_feature_size = get_max_fuyu_image_tokens(ctx) - image_token_ids = (array("I", [_IMAGE_TOKEN_ID]) * ncol + array("I", [_NEWLINE_TOKEN_ID])) * nrow + image_token_ids = (array("I", [_IMAGE_TOKEN_ID]) * ncol + + array("I", [_NEWLINE_TOKEN_ID])) * nrow token_ids = array("I", image_token_ids) * num_images token_ids += array("I", [0]) * (seq_len - image_feature_size * num_images) return SequenceData(token_ids) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 0ba1cf0696d..1371e658c8b 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -151,12 +151,12 @@ class SamplingParams(msgspec.Struct, def __post_init__(self) -> None: self.best_of = self.best_of or self.n - if 0 < temperature < _MAX_TEMP: + if 0 < self.temperature < _MAX_TEMP: logger.warning( "temperature %s is less than %s, which may cause numerical " "errors nan or inf in tensors. 
We have maxed it out to %s.", - temperature, _MAX_TEMP, _MAX_TEMP) - temperature = max(temperature, _MAX_TEMP) + self.temperature, _MAX_TEMP, _MAX_TEMP) + self.temperature = max(self.temperature, _MAX_TEMP) if self.seed == -1: self.seed = None else: diff --git a/vllm/sequence.py b/vllm/sequence.py index 5c1bb9d2802..8e875bd4af6 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -548,8 +548,8 @@ def __repr__(self) -> str: f"num_blocks={self.n_blocks}, ") -@dataclass -class SequenceGroupState: +class SequenceGroupState(msgspec.Struct, + omits_default=True): # type: ignore[call-arg] """Mutable state tied to a specific sequence group""" # for multi-step decoding @@ -837,6 +837,8 @@ class SequenceGroupMetadataDelta( do_sample: bool = True token_chunk_size: Optional[int] = None computed_block_nums: Optional[List[int]] = None + state: Optional[SequenceGroupState] = msgspec.field( + default_factory=lambda: SequenceGroupState()) class SequenceGroupMetadata( @@ -934,6 +936,7 @@ def apply_delta(self, self.is_prompt = sequence_group_metadata_delta.is_prompt def finish_step(self) -> None: + assert self.state is not None assert self.state.current_step < self.state.num_steps self.state.current_step += 1 @@ -1237,6 +1240,7 @@ def is_first_multi_step(self) -> bool: # steps assert len(self.seq_group_metadata_list) > 0 first_seq_group = self.seq_group_metadata_list[0] + assert first_seq_group.state is not None return first_seq_group.state.current_step == 0 @property @@ -1245,6 +1249,7 @@ def is_last_step(self) -> bool: # steps assert len(self.seq_group_metadata_list) > 0 first_seq_group = self.seq_group_metadata_list[0] + assert first_seq_group.state is not None num_steps = first_seq_group.state.num_steps current_step = first_seq_group.state.current_step return num_steps - current_step == 1 @@ -1254,7 +1259,9 @@ def current_step(self) -> int: # TODO(will) make this be able to handle batches with variable number of # steps assert len(self.seq_group_metadata_list) > 0 - return self.seq_group_metadata_list[0].state.current_step + state = self.seq_group_metadata_list[0].state + assert state is not None + return state.current_step def clone( self, seq_group_metadata_list: List[Union[SequenceGroupMetadata, From d041e9cb122ea583b2831a5eae7e73aaaaba2a29 Mon Sep 17 00:00:00 2001 From: sang Date: Thu, 15 Aug 2024 15:56:41 -0700 Subject: [PATCH 30/36] Addressed code review. 
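
Follow-ups from review: the scheduler flag is promoted from the private `_send_delta_data` to a plain `send_delta_data` attribute, the distributed tests rename their `enable_adag` parameter to `enable_spmd`, and the misspelled `omits_default` keyword on SequenceGroupState is corrected to `omit_defaults` so the option actually takes effect.

`omit_defaults` is what keeps the per-step messages small: fields that still hold their default value are left out of the encoded payload and are filled back in on decode. A self-contained sketch of that behaviour, using the multi-step fields as placeholders (the defaults here are illustrative, not the real SequenceGroupState values):

    import msgspec

    class State(msgspec.Struct, omit_defaults=True):
        # Stand-in for SequenceGroupState.
        num_steps: int = 1
        current_step: int = 0

    wire = msgspec.msgpack.encode(State())  # default-valued fields are omitted
    assert msgspec.msgpack.decode(wire, type=State) == State()
    assert msgspec.msgpack.decode(
        msgspec.msgpack.encode(State(current_step=3)), type=State).current_step == 3
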
--- tests/distributed/test_basic_distributed_correctness.py | 6 +++--- tests/distributed/test_chunked_prefill_distributed.py | 6 +++--- vllm/config.py | 6 +++--- vllm/core/scheduler.py | 2 +- vllm/engine/arg_utils.py | 6 +++--- vllm/sequence.py | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index f6bf3bb60d6..a5fb495f4c1 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -23,7 +23,7 @@ reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize( "model, distributed_executor_backend, attention_backend, " - "test_suite, enable_adag", [ + "test_suite, enable_spmd", [ ("facebook/opt-125m", "ray", "", "L4", False), ("facebook/opt-125m", "ray", "", "L4", True), ("facebook/opt-125m", "mp", "", "L4", False), @@ -43,13 +43,13 @@ def test_models( distributed_executor_backend: str, attention_backend: str, test_suite: str, - enable_adag: bool, + enable_spmd: bool, ) -> None: if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") - if enable_adag: + if enable_spmd: # test ray adag os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 516296f876f..a248bd4d6b1 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -18,7 +18,7 @@ @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("model, distributed_executor_backend, enable_adag", [ +@pytest.mark.parametrize("model, distributed_executor_backend, enable_spmd", [ ("facebook/opt-125m", "ray", False), ("facebook/opt-125m", "ray", True), ("meta-llama/Llama-2-7b-hf", "ray", False), @@ -32,9 +32,9 @@ def test_models( example_prompts, model: str, distributed_executor_backend: str, - enable_adag: bool, + enable_spmd: bool, ) -> None: - if enable_adag: + if enable_spmd: assert distributed_executor_backend == "ray" # test ray adag os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" diff --git a/vllm/config.py b/vllm/config.py index 27fc140f565..761868b643f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -836,7 +836,7 @@ class SchedulerConfig: swapping. However, when the sequence group has multiple sequences (e.g., beam search), recomputation is not currently supported. In such a case, we use swapping instead. - _send_delta_data: Private API. If used, scheduler sends delta data to + send_delta_data: Private API. If used, scheduler sends delta data to workers instead of an entire data. It should be enabled only when SPMD worker architecture is enabled. 
I.e., VLLM_USE_RAY_SPMD_WORKER=1 @@ -854,7 +854,7 @@ def __init__(self, embedding_mode: Optional[bool] = False, preemption_mode: Optional[str] = None, num_scheduler_steps: int = 1, - _send_delta_data: bool = False) -> None: + send_delta_data: bool = False) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: @@ -884,7 +884,7 @@ def __init__(self, self.embedding_mode = embedding_mode self.preemption_mode = preemption_mode self.num_scheduler_steps = num_scheduler_steps - self._send_delta_data = _send_delta_data + self.send_delta_data = send_delta_data self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 5e951a228ef..49382e10adb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1096,7 +1096,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: # It assumes the scheduled_seq_groups is ordered by # prefill < decoding. - if is_first_prefill or not self.scheduler_config._send_delta_data: + if is_first_prefill or not self.scheduler_config.send_delta_data: seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, is_prompt=is_prompt, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 19b104c48b6..360d2ff3538 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -893,9 +893,9 @@ def create_engine_config(self, ) -> EngineConfig: embedding_mode=model_config.embedding_mode, preemption_mode=self.preemption_mode, num_scheduler_steps=self.num_scheduler_steps, - _send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER - and envs.VLLM_SPMD_SEND_DELTA_DATA - and parallel_config.use_ray), + send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER + and envs.VLLM_SPMD_SEND_DELTA_DATA + and parallel_config.use_ray), ) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, diff --git a/vllm/sequence.py b/vllm/sequence.py index 8e875bd4af6..f27a9a35c12 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -549,7 +549,7 @@ def __repr__(self) -> str: class SequenceGroupState(msgspec.Struct, - omits_default=True): # type: ignore[call-arg] + omit_defaults=True): # type: ignore[call-arg] """Mutable state tied to a specific sequence group""" # for multi-step decoding From f938e00618fe34900b5118d90631edcfe447210c Mon Sep 17 00:00:00 2001 From: sang Date: Fri, 16 Aug 2024 22:53:37 -0700 Subject: [PATCH 31/36] fix pydantic not compatible to msggspec.Struct. 
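
Logprob became a msgspec.Struct earlier in this series, and pydantic v2 cannot generate a validation schema for field types it does not recognize, so the OpenAI response models that carry Logprob values now opt into `arbitrary_types_allowed`. A minimal reproduction of the incompatibility and the fix, using stand-in classes rather than the actual protocol models (the field names below are illustrative):

    from typing import Dict, Optional

    import msgspec
    from pydantic import BaseModel, ConfigDict

    class Logprob(msgspec.Struct):
        # Stand-in for vllm.sequence.Logprob.
        logprob: float
        rank: Optional[int] = None
        decoded_token: Optional[str] = None

    class Choice(BaseModel):
        # Without this, pydantic raises a schema-generation error for the
        # msgspec.Struct-typed field below.
        model_config = ConfigDict(arbitrary_types_allowed=True)
        logprobs: Optional[Dict[int, Logprob]] = None

    Choice(logprobs={0: Logprob(logprob=-0.1)})

With `arbitrary_types_allowed`, pydantic falls back to an isinstance check for the Logprob values instead of trying to build a schema for them.
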
--- vllm/entrypoints/openai/protocol.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index aef42e9425e..6aad0ed6929 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -526,6 +526,9 @@ class CompletionLogProbs(OpenAIBaseModel): class CompletionResponseChoice(OpenAIBaseModel): + # For Logprob because it is msgspec.Struct + model_config = ConfigDict(arbitrary_types_allowed=True) + index: int text: str logprobs: Optional[CompletionLogProbs] = None @@ -627,6 +630,9 @@ class ChatCompletionResponseChoice(OpenAIBaseModel): class ChatCompletionResponse(OpenAIBaseModel): + # For Logprob because it is msgspec.Struct + model_config = ConfigDict(arbitrary_types_allowed=True) + id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") object: Literal["chat.completion"] = "chat.completion" created: int = Field(default_factory=lambda: int(time.time())) From 32cb9849319a1637e63817c80f8334f411af7d68 Mon Sep 17 00:00:00 2001 From: sang Date: Sat, 17 Aug 2024 00:10:18 -0700 Subject: [PATCH 32/36] addressed --- .../test_basic_distributed_correctness.py | 23 +++++++++--------- .../test_chunked_prefill_distributed.py | 15 ++++++------ tests/samplers/test_sampler.py | 24 ++++++++++++++----- tests/spec_decode/utils.py | 8 ++++--- tests/test_logits_processor.py | 7 ++++-- tests/test_sequence.py | 5 ++-- .../test_encoder_decoder_model_runner.py | 15 ++++++++---- tests/worker/test_model_runner.py | 14 +++++++---- vllm/engine/arg_utils.py | 1 - vllm/envs.py | 5 ---- vllm/executor/msgspec_utils.py | 5 +++- vllm/inputs/registry.py | 7 +++++- vllm/model_executor/layers/sampler.py | 3 --- vllm/model_executor/models/blip.py | 8 ++++--- vllm/model_executor/models/blip2.py | 9 ++++--- vllm/model_executor/models/chameleon.py | 9 ++++--- vllm/model_executor/models/clip.py | 8 ++++--- vllm/model_executor/models/fuyu.py | 13 ++++++---- vllm/model_executor/models/minicpmv.py | 5 ++-- vllm/model_executor/models/siglip.py | 8 ++++--- vllm/model_executor/sampling_metadata.py | 9 ++++--- vllm/sequence.py | 12 +++++++--- vllm/spec_decode/batch_expansion.py | 8 ++++--- 23 files changed, 136 insertions(+), 85 deletions(-) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index a5fb495f4c1..7f373a01aa5 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -23,16 +23,16 @@ reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize( "model, distributed_executor_backend, attention_backend, " - "test_suite, enable_spmd", [ - ("facebook/opt-125m", "ray", "", "L4", False), - ("facebook/opt-125m", "ray", "", "L4", True), - ("facebook/opt-125m", "mp", "", "L4", False), - ("meta-llama/Llama-2-7b-hf", "ray", "", "L4", False), - ("meta-llama/Llama-2-7b-hf", "mp", "", "L4", False), - ("facebook/opt-125m", "ray", "", "A100", False), - ("facebook/opt-125m", "mp", "", "A100", False), - ("facebook/opt-125m", "mp", "FLASHINFER", "A100", False), - ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", False), + "test_suite", [ + ("facebook/opt-125m", "ray", "", "L4"), + ("facebook/opt-125m", "ray", "", "L4"), + ("facebook/opt-125m", "mp", "", "L4"), + ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), + ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"), + ("facebook/opt-125m", "ray", "", "A100"), + ("facebook/opt-125m", "mp", "", "A100"), + 
("facebook/opt-125m", "mp", "FLASHINFER", "A100"), + ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"), ]) @fork_new_process_for_each_test def test_models( @@ -43,13 +43,12 @@ def test_models( distributed_executor_backend: str, attention_backend: str, test_suite: str, - enable_spmd: bool, ) -> None: if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") - if enable_spmd: + if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa # test ray adag os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1" diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index a248bd4d6b1..83601a81d2d 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -18,12 +18,12 @@ @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize("model, distributed_executor_backend, enable_spmd", [ - ("facebook/opt-125m", "ray", False), - ("facebook/opt-125m", "ray", True), - ("meta-llama/Llama-2-7b-hf", "ray", False), - ("facebook/opt-125m", "mp", False), - ("meta-llama/Llama-2-7b-hf", "mp", False), +@pytest.mark.parametrize("model, distributed_executor_backend", [ + ("facebook/opt-125m", "ray"), + ("facebook/opt-125m", "ray"), + ("meta-llama/Llama-2-7b-hf", "ray"), + ("facebook/opt-125m", "mp"), + ("meta-llama/Llama-2-7b-hf", "mp"), ]) @fork_new_process_for_each_test def test_models( @@ -32,9 +32,8 @@ def test_models( example_prompts, model: str, distributed_executor_backend: str, - enable_spmd: bool, ) -> None: - if enable_spmd: + if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa assert distributed_executor_backend == "ray" # test ray adag os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1" diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index d5c35c28adf..820fb554888 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -11,7 +11,8 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams, + SequenceData, SequenceGroupMetadata) from vllm.utils import Counter, is_pin_memory_available @@ -57,7 +58,9 @@ def _do_sample( SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData(array("I", [1, 2, 3]))}, + seq_data={ + 0: SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3])) + }, sampling_params=sampling_params, block_tables={0: [1]}, )) @@ -202,7 +205,8 @@ def create_sampling_params(min_tokens, def create_sequence_data(num_input=3, num_generated=0): seq_data = SequenceData( - array("I", random.choices(range(0, VOCAB_SIZE), k=num_input))) + array(VLLM_TOKEN_ID_ARRAY_TYPE, + random.choices(range(0, VOCAB_SIZE), k=num_input))) if num_generated > 0: seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), k=num_generated) @@ -505,7 +509,9 @@ def test_sampler_mixed(seed: int, device: str): SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData(array("I", [1, 2, 3]))}, + seq_data={ + 0: SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 
3])) + }, sampling_params=sampling_params, block_tables={0: [1]}, )) @@ -601,7 +607,9 @@ def test_sampler_top_k_top_p(seed: int, device: str): SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData(array("I", [1, 2, 3]))}, + seq_data={ + 0: SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3])) + }, sampling_params=SamplingParams( temperature=1, top_k=top_k, @@ -651,7 +659,11 @@ def test_sampling_params(sampling_params: List[SamplingParams]): SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData(array("I", [1, 2, 3]))}, + seq_data={ + 0: + SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, + [1, 2, 3])) + }, sampling_params=sampling_params[i], block_tables={0: [1]}, )) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index a4f5e87b0f7..60b36a33d90 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -10,7 +10,8 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.utils import set_random_seed from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, + CompletionSequenceGroupOutput, Logprob, SamplerOutput, SequenceData, SequenceGroupMetadata, SequenceOutput) from vllm.utils import get_distributed_init_method, get_ip, get_open_port @@ -139,8 +140,9 @@ def create_seq_group_metadata_from_prompts( seq_data={ i: SequenceData( - array("I", prompt_token_ids[:]), - _output_token_ids=array("I", cont_token_ids[:]), + array(VLLM_TOKEN_ID_ARRAY_TYPE, prompt_token_ids[:]), + _output_token_ids=array(VLLM_TOKEN_ID_ARRAY_TYPE, + cont_token_ids[:]), ), }, sampling_params=SamplingParams(temperature=0.0, ), diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index fb0f7aa6e77..1ce49a50688 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -9,7 +9,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams, + SequenceData, SequenceGroupMetadata) from vllm.utils import is_pin_memory_available @@ -70,7 +71,9 @@ def pick_ith(token_ids, logits): SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, - seq_data={0: SequenceData(array("I", [1, 2, 3]))}, + seq_data={ + 0: SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3])) + }, sampling_params=SamplingParams(temperature=0, logits_processors=[pick_ith]), block_tables={0: [1]}, diff --git a/tests/test_sequence.py b/tests/test_sequence.py index b780f5673c1..1ae349e808e 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -2,7 +2,8 @@ import pytest -from vllm.sequence import (CompletionSequenceGroupOutput, SamplerOutput, +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, + CompletionSequenceGroupOutput, SamplerOutput, SequenceData, SequenceOutput) from .core.utils import create_dummy_prompt @@ -56,7 +57,7 @@ def test_sampler_output_eq(sample_outputs): def test_sequence_data_prefill(): - seq_data = SequenceData(array("I", [1, 2, 3, 4])) + seq_data = SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3, 4])) assert seq_data.get_num_uncomputed_tokens() == 4 assert seq_data.get_num_computed_tokens() == 0 # advance by 2 diff --git 
a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 05d7161a0e9..32bff22f66a 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -5,7 +5,8 @@ import torch from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams, + SequenceData, SequenceGroupMetadata) from vllm.utils import is_cpu from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner @@ -126,10 +127,12 @@ def test_prepare_prompt( # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(array("I", range(seq_len))) + seq_data = SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, + range(seq_len))) encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 encoder_seq_lens.append(encoder_seq_len) - encoder_seq_data = SequenceData(array("I", range(encoder_seq_len))) + encoder_seq_data = SequenceData( + array(VLLM_TOKEN_ID_ARRAY_TYPE, range(encoder_seq_len))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -320,10 +323,12 @@ def test_prepare_decode( # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(array("I", (range(seq_len)))) + seq_data = SequenceData( + array(VLLM_TOKEN_ID_ARRAY_TYPE, (range(seq_len)))) encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 encoder_seq_lens.append(encoder_seq_len) - encoder_seq_data = SequenceData(array("I", (range(encoder_seq_len)))) + encoder_seq_data = SequenceData( + array(VLLM_TOKEN_ID_ARRAY_TYPE, (range(encoder_seq_len)))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=False, diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 55db21e6a67..a20aa37bcc1 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -8,7 +8,8 @@ init_distributed_environment) from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams, + SequenceData, SequenceGroupMetadata) from vllm.utils import get_open_port from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size @@ -47,7 +48,8 @@ def test_prepare_prompt(batch_size): # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(array("I", range(seq_len))) + seq_data = SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, + range(seq_len))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -164,7 +166,8 @@ def test_prepare_decode_cuda_graph(batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 context_lens.append(context_len) - seq_data = SequenceData(array("I", range(context_len))) + seq_data = SequenceData( + array(VLLM_TOKEN_ID_ARRAY_TYPE, range(context_len))) seq_data.update_num_computed_tokens(context_len) # Append one token ID since prefill is finished. 
seq_data.append_token_id(1, 0) @@ -325,7 +328,8 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData(array("I", range(seq_len))) + seq_data = SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, + range(seq_len))) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -341,7 +345,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): for i in range(prefill_batch_size, batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 - prompt_toks = array("I", range(context_len)) + prompt_toks = array(VLLM_TOKEN_ID_ARRAY_TYPE, range(context_len)) seq_data = SequenceData(prompt_toks) seq_data.append_token_id(1, 0) seq_data.update_num_computed_tokens(context_len) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index de00751b432..f81347fab76 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -909,7 +909,6 @@ def create_engine_config(self, ) -> EngineConfig: preemption_mode=self.preemption_mode, num_scheduler_steps=self.num_scheduler_steps, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER - and envs.VLLM_SPMD_SEND_DELTA_DATA and parallel_config.use_ray), ) lora_config = LoRAConfig( diff --git a/vllm/envs.py b/vllm/envs.py index a49b917ed1e..b0cb56e58d0 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -39,7 +39,6 @@ VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 VLLM_USE_RAY_SPMD_WORKER: bool = False - VLLM_SPMD_SEND_DELTA_DATA: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: bool = True VLLM_WORKER_MULTIPROC_METHOD: str = "fork" @@ -294,10 +293,6 @@ def get_default_config_root(): "VLLM_USE_RAY_SPMD_WORKER": lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))), - # If set, it sends delta data for SPMD workers. - "VLLM_SPMD_SEND_DELTA_DATA": - lambda: bool(int(os.getenv("VLLM_SPMD_SEND_DELTA_DATA", "1"))), - # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index cb634c6f74e..7d07220206e 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,6 +1,8 @@ from array import array from typing import Any, Type +from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE + def encode_hook(obj: Any) -> Any: """Custom msgspec enc hook that supports array types. 
@@ -8,6 +10,7 @@ def encode_hook(obj: Any) -> Any: See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ if isinstance(obj, array): + assert obj.typecode == "l" return obj.tobytes() else: raise ValueError(f"Unsupported serialization type: {type(obj)}") @@ -19,7 +22,7 @@ def decode_hook(type: Type, obj: Any) -> Any: See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ if type is array: - deserialized = array('I') + deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) deserialized.frombytes(obj) return deserialized else: diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 576bfabff5a..7c17895bf1d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -22,6 +22,10 @@ C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig) +# NOTE: This has to match with sequence.py's VLLM_TOKEN_ID_ARRAY_TYPE. +# We cannot import it here because of circular dependencies. +VLLM_TOKEN_ID_ARRAY_TYPE = "l" + @dataclass(frozen=True) class InputContext: @@ -133,7 +137,8 @@ def _default_dummy_data_factory( # Avoid circular import from vllm.sequence import SequenceData - dummy_seq_data = SequenceData(array("I", [0]) * seq_len) + dummy_seq_data = SequenceData( + array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * seq_len) dummy_multi_modal_data = None return dummy_seq_data, dummy_multi_modal_data diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 111950774a1..41abdf211e7 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -795,7 +795,6 @@ def _get_logprobs( # Update indices and tokens for prompt logprobs. if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): - print(f"SANG-TODO {sampling_params.prompt_logprobs=}") largest_num_logprobs = max(largest_num_logprobs, sampling_params.prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) @@ -815,7 +814,6 @@ def _get_logprobs( next_token_ids.extend(token_ids) if sampling_params.logprobs is not None: - print(f"SANG-TODO {sampling_params.logprobs=}") largest_num_logprobs = max(largest_num_logprobs, sampling_params.logprobs) @@ -854,7 +852,6 @@ def _get_logprobs( if largest_num_logprobs > 0: # Logprobs of topk tokens for a batch of sequence groups. # (num_query_tokens_across_batch). 
- print(f"SANG-TODO {largest_num_logprobs=}") top_logprobs, top_token_ids = torch.topk(logprobs, largest_num_logprobs, dim=-1) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 1be91c1a9e9..69e777152e3 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -17,7 +17,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) -from vllm.sequence import SequenceData +from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -54,8 +54,10 @@ def dummy_seq_data_for_blip( else: image_feature_size = image_feature_size_override - token_ids = array("I", [image_token_id]) * image_feature_size - token_ids += array("I", [0]) * (seq_len - image_feature_size) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, + [image_token_id]) * image_feature_size + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0]) * (seq_len - image_feature_size) return SequenceData(token_ids) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 6509c8485bf..8cfd3c26725 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -18,7 +18,8 @@ from vllm.model_executor.models.opt import OPTModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, + SamplerOutput, SequenceData) from .blip import (BlipVisionModel, dummy_image_for_blip, get_max_blip_image_tokens) @@ -428,8 +429,10 @@ def dummy_seq_data_for_blip2( else: image_feature_size = image_feature_size_override - token_ids = array("I", [image_token_id]) * image_feature_size * num_images - token_ids += array("I", [0]) * (seq_len - image_feature_size * num_images) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, + [image_token_id]) * image_feature_size * num_images + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0]) * (seq_len - image_feature_size * num_images) return SequenceData(token_ids) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 4cfa8802f75..788d22db9d5 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -32,7 +32,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) -from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, + SamplerOutput, SequenceData) from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal @@ -71,8 +72,10 @@ def dummy_seq_data_for_chameleon( else: image_feature_size = image_feature_size_override - token_ids = array("I", [image_token_id]) * image_feature_size * num_images - token_ids += array("I", [0]) * (seq_len - image_feature_size * num_images) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, + [image_token_id]) * image_feature_size * num_images + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0]) * (seq_len - image_feature_size * num_images) return SequenceData(token_ids) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 4b4f1353288..24eeefdfccf 100644 --- 
a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -18,7 +18,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) -from vllm.sequence import SequenceData +from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -54,8 +54,10 @@ def dummy_seq_data_for_clip( else: image_feature_size = image_feature_size_override - token_ids = array("I", [image_token_id]) * image_feature_size * num_images - token_ids += array("I", [0]) * (seq_len - image_feature_size * num_images) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, + [image_token_id]) * image_feature_size * num_images + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0]) * (seq_len - image_feature_size * num_images) return SequenceData(token_ids) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 71cd7511ccf..2ef23819b69 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -38,7 +38,8 @@ from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.image import (cached_get_image_processor, cached_get_tokenizer) -from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, + SamplerOutput, SequenceData) from .interfaces import SupportsMultiModal from .utils import merge_multimodal_embeddings @@ -98,10 +99,12 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): ncol, nrow = get_max_fuyu_image_feature_size() image_feature_size = get_max_fuyu_image_tokens(ctx) - image_token_ids = (array("I", [_IMAGE_TOKEN_ID]) * ncol + - array("I", [_NEWLINE_TOKEN_ID])) * nrow - token_ids = array("I", image_token_ids) * num_images - token_ids += array("I", [0]) * (seq_len - image_feature_size * num_images) + image_token_ids = ( + array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol + + array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0]) * (seq_len - image_feature_size * num_images) return SequenceData(token_ids) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 11841093d6c..729bd27c334 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -56,7 +56,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import (cached_get_image_processor, cached_get_tokenizer) -from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, + SamplerOutput, SequenceData) from .idefics2_vision_model import Idefics2VisionTransformer @@ -409,7 +410,7 @@ def get_max_minicpmv_image_tokens(ctx: InputContext): def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int): - token_ids = array("I", [0]) * seq_len + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * seq_len return SequenceData(token_ids) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 2d4f86fe995..426af7fee95 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -26,7 +26,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from 
vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) -from vllm.sequence import SequenceData +from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -63,8 +63,10 @@ def dummy_seq_data_for_siglip( else: image_feature_size = image_feature_size_override - token_ids = array("I", [image_token_id]) * image_feature_size - token_ids += array("I", [0]) * (seq_len - image_feature_size) + token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, + [image_token_id]) * image_feature_size + token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, + [0]) * (seq_len - image_feature_size) return SequenceData(token_ids) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ead51bf277c..a085779bc61 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -6,7 +6,8 @@ import torch from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import SequenceData, SequenceGroupMetadata +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData, + SequenceGroupMetadata) from vllm.triton_utils.sample import get_num_triton_sampler_splits from vllm.utils import (PyObjectCache, async_tensor_h2d, is_pin_memory_available, make_tensor_with_pad, @@ -505,9 +506,11 @@ def from_sampling_metadata( and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) prompt_tokens.extend( - array('I') for _ in range(prefill_len)) + array(VLLM_TOKEN_ID_ARRAY_TYPE) + for _ in range(prefill_len)) output_tokens.extend( - array('I') for _ in range(prefill_len)) + array(VLLM_TOKEN_ID_ARRAY_TYPE) + for _ in range(prefill_len)) if seq_group.do_sample: for seq_id in seq_ids: seq_data = seq_group.seq_data[seq_id] diff --git a/vllm/sequence.py b/vllm/sequence.py index f27a9a35c12..04c45c1831e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -23,6 +23,8 @@ from vllm.inputs import LLMInputs from vllm.multimodal.base import MultiModalDataDict +VLLM_TOKEN_ID_ARRAY_TYPE = "l" + class Logprob( msgspec.Struct, @@ -148,7 +150,7 @@ class SequenceData(msgspec.Struct, # union of 2 list types. _prompt_token_ids: array _output_token_ids: array = msgspec.field( - default_factory=lambda: array("I", [])) + default_factory=lambda: array(VLLM_TOKEN_ID_ARRAY_TYPE, [])) ### The below fields should not be passed as an argument ### _cumulative_logprob: float = 0.0 @@ -164,6 +166,8 @@ class SequenceData(msgspec.Struct, _new_appended_tokens: List[int] = msgspec.field(default_factory=list) def __post_init__(self) -> None: + assert self._prompt_token_ids.typecode == "l" + assert self._output_token_ids.typecode == "l" self._prompt_token_ids_tuple: Tuple[int, ...] 
= tuple( self._prompt_token_ids) self._update_cached_all_tokens() @@ -201,7 +205,8 @@ def output_token_ids(self) -> Tuple[int, ...]: @output_token_ids.setter def output_token_ids(self, new_output_token_ids: List[int]) -> None: - self._output_token_ids = array('I', new_output_token_ids) + self._output_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, + new_output_token_ids) self._update_cached_all_tokens() @property @@ -383,7 +388,8 @@ def __init__( f"invalid input {inputs}; did you forget the " "encoder input prompt fields?") - self.data = SequenceData(array("I", self.prompt_token_ids)) + self.data = SequenceData( + array(VLLM_TOKEN_ID_ARRAY_TYPE, self.prompt_token_ids)) self.output_logprobs: SampleLogprobs = [] self.output_text = "" diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index f0c26d36633..aec4847b96c 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -5,8 +5,9 @@ import torch from vllm import SamplingParams -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, - SequenceGroupMetadata, get_all_seq_ids) +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, ExecuteModelRequest, + SamplerOutput, SequenceData, SequenceGroupMetadata, + get_all_seq_ids) from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, @@ -301,7 +302,8 @@ def _create_single_target_seq_group_metadata( target_seq_id: SequenceData( prompt_token_ids, - _output_token_ids=array("I", new_output_token_ids), + _output_token_ids=array(VLLM_TOKEN_ID_ARRAY_TYPE, + new_output_token_ids), ), } # This is a hack. Technically, spec decoding should compute From c92187781e8be3ac71e7557405173b8b9ab603a3 Mon Sep 17 00:00:00 2001 From: sang Date: Sat, 17 Aug 2024 11:22:59 -0700 Subject: [PATCH 33/36] fixed --- vllm/sequence.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 04c45c1831e..fc60f36969e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -26,10 +26,8 @@ VLLM_TOKEN_ID_ARRAY_TYPE = "l" -class Logprob( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] +@dataclass +class Logprob: """Infos for supporting OpenAI compatible logprobs and token ranks. 
Attributes: From ae1fb21fff41b1356e3174f61132dc3d03c5ed5f Mon Sep 17 00:00:00 2001 From: sang Date: Sat, 17 Aug 2024 11:30:45 -0700 Subject: [PATCH 34/36] temporarily use dataclass --- vllm/entrypoints/openai/protocol.py | 6 ------ vllm/sequence.py | 3 +++ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6aad0ed6929..aef42e9425e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -526,9 +526,6 @@ class CompletionLogProbs(OpenAIBaseModel): class CompletionResponseChoice(OpenAIBaseModel): - # For Logprob because it is msgspec.Struct - model_config = ConfigDict(arbitrary_types_allowed=True) - index: int text: str logprobs: Optional[CompletionLogProbs] = None @@ -630,9 +627,6 @@ class ChatCompletionResponseChoice(OpenAIBaseModel): class ChatCompletionResponse(OpenAIBaseModel): - # For Logprob because it is msgspec.Struct - model_config = ConfigDict(arbitrary_types_allowed=True) - id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") object: Literal["chat.completion"] = "chat.completion" created: int = Field(default_factory=lambda: int(time.time())) diff --git a/vllm/sequence.py b/vllm/sequence.py index fc60f36969e..864136854d2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -26,6 +26,9 @@ VLLM_TOKEN_ID_ARRAY_TYPE = "l" +# We use dataclass for now because it is used for +# openai server output, and msgspec is not serializable. +# TODO(sang): Fix it. @dataclass class Logprob: """Infos for supporting OpenAI compatible logprobs and token ranks. From 3e1325ef7c8eb6b5babe842738037dc3a82bca5a Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 18 Aug 2024 11:36:42 -0700 Subject: [PATCH 35/36] Addressed code review. 
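
More review follow-ups: the duplicated rows in the distributed test parametrization are dropped, encode_hook now asserts that any array it sees uses VLLM_TOKEN_ID_ARRAY_TYPE (with a descriptive message) instead of raising on other types, SamplingParams caches `sampling_type` with @cached_property, and SequenceGroupMetadata picks the first sequence with `next(iter(...))` rather than materializing a list.

msgspec structs are slot-based and have no instance `__dict__` by default, which is what functools.cached_property needs to store its result; that is why the struct is now declared with `dict=True`. A small sketch of the pattern with a stand-in class (the field, values, and return type are illustrative, not the real SamplingParams):

    from functools import cached_property

    import msgspec

    class Params(msgspec.Struct, dict=True):
        temperature: float = 1.0

        @cached_property
        def sampling_type(self) -> str:
            # Computed once, then stored in the instance __dict__,
            # which exists only because dict=True is set above.
            return "greedy" if self.temperature == 0.0 else "random"

    p = Params(temperature=0.0)
    assert p.sampling_type == "greedy"
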
--- tests/distributed/test_basic_distributed_correctness.py | 1 - tests/distributed/test_chunked_prefill_distributed.py | 1 - vllm/executor/msgspec_utils.py | 9 ++++----- vllm/sampling_params.py | 7 +++++-- vllm/sequence.py | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py index 7f373a01aa5..e254686f269 100644 --- a/tests/distributed/test_basic_distributed_correctness.py +++ b/tests/distributed/test_basic_distributed_correctness.py @@ -24,7 +24,6 @@ @pytest.mark.parametrize( "model, distributed_executor_backend, attention_backend, " "test_suite", [ - ("facebook/opt-125m", "ray", "", "L4"), ("facebook/opt-125m", "ray", "", "L4"), ("facebook/opt-125m", "mp", "", "L4"), ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"), diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py index 83601a81d2d..262845f1982 100644 --- a/tests/distributed/test_chunked_prefill_distributed.py +++ b/tests/distributed/test_chunked_prefill_distributed.py @@ -19,7 +19,6 @@ @pytest.mark.skipif(cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.") @pytest.mark.parametrize("model, distributed_executor_backend", [ - ("facebook/opt-125m", "ray"), ("facebook/opt-125m", "ray"), ("meta-llama/Llama-2-7b-hf", "ray"), ("facebook/opt-125m", "mp"), diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index 7d07220206e..48f4a270daa 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -10,10 +10,11 @@ def encode_hook(obj: Any) -> Any: See https://jcristharif.com/msgspec/api.html#msgspec.msgpack.Encoder """ if isinstance(obj, array): - assert obj.typecode == "l" + assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, ( + f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " + f"Given array has a type code of {obj.typecode}." + ) return obj.tobytes() - else: - raise ValueError(f"Unsupported serialization type: {type(obj)}") def decode_hook(type: Type, obj: Any) -> Any: @@ -25,5 +26,3 @@ def decode_hook(type: Type, obj: Any) -> Any: deserialized = array(VLLM_TOKEN_ID_ARRAY_TYPE) deserialized.frombytes(obj) return deserialized - else: - raise ValueError(f"Unsupported deserialization type: {type(obj)}") diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 1371e658c8b..2a4aeb83d23 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -6,6 +6,7 @@ import msgspec import torch from typing_extensions import Annotated +from functools import cached_property from vllm.logger import init_logger @@ -33,7 +34,9 @@ class SamplingType(IntEnum): class SamplingParams(msgspec.Struct, - omit_defaults=True): # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + # required for @cached_property. + dict=True): # type: ignore[call-arg] """Sampling parameters for text generation. 
Overall, we follow the sampling parameters from the OpenAI text completion @@ -309,7 +312,7 @@ def update_from_generation_config( eos_ids.update(self.stop_token_ids) self.stop_token_ids = list(eos_ids) - @property + @cached_property def sampling_type(self) -> SamplingType: if self.use_beam_search: return SamplingType.BEAM diff --git a/vllm/sequence.py b/vllm/sequence.py index 864136854d2..b15955cde76 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -913,8 +913,8 @@ class SequenceGroupMetadata( def __post_init__(self): if self.seq_data is not None and self.token_chunk_size is None: if self.is_prompt: - self.token_chunk_size = list( - self.seq_data.values())[0].get_len() + self.token_chunk_size = next(iter( + self.seq_data.values())).get_len() else: self.token_chunk_size = 1 From 652c258e4ac29880cba948251bacc04dbd8fa457 Mon Sep 17 00:00:00 2001 From: sang Date: Sun, 18 Aug 2024 11:40:32 -0700 Subject: [PATCH 36/36] lint --- vllm/executor/msgspec_utils.py | 3 +-- vllm/sampling_params.py | 11 ++++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index 48f4a270daa..c467115f124 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -12,8 +12,7 @@ def encode_hook(obj: Any) -> Any: if isinstance(obj, array): assert obj.typecode == VLLM_TOKEN_ID_ARRAY_TYPE, ( f"vLLM array type should use '{VLLM_TOKEN_ID_ARRAY_TYPE}' type. " - f"Given array has a type code of {obj.typecode}." - ) + f"Given array has a type code of {obj.typecode}.") return obj.tobytes() diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 2a4aeb83d23..7197b513985 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -1,12 +1,12 @@ """Sampling parameters for text generation.""" import copy from enum import IntEnum +from functools import cached_property from typing import Any, Callable, Dict, List, Optional, Set, Union import msgspec import torch from typing_extensions import Annotated -from functools import cached_property from vllm.logger import init_logger @@ -33,10 +33,11 @@ class SamplingType(IntEnum): to sample from.""" -class SamplingParams(msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - # required for @cached_property. - dict=True): # type: ignore[call-arg] +class SamplingParams( + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + # required for @cached_property. + dict=True): # type: ignore[call-arg] """Sampling parameters for text generation. Overall, we follow the sampling parameters from the OpenAI text completion