# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/uds-cli/v0.16.0/zarf.schema.json

kind: ZarfPackageConfig

metadata:
  name: vllm
  version: "###ZARF_PKG_TMPL_IMAGE_VERSION###"
  description: >
    vLLM model

constants:
  - name: IMAGE_VERSION
    value: "###ZARF_PKG_TMPL_IMAGE_VERSION###"
  - name: MODEL_REPO_ID
    description: "The HuggingFace repository ID"
    value: "###ZARF_PKG_TMPL_MODEL_REPO_ID###"
  - name: MODEL_REVISION
    description: "The HuggingFace git branch or commit hash"
    value: "###ZARF_PKG_TMPL_MODEL_REVISION###"
  - name: MODEL_PATH
    description: "Defines the location of the Zarf-injected model files in the vLLM container"
    value: "###ZARF_PKG_TMPL_MODEL_PATH###"
  - name: NAME_OVERRIDE
    description: "Provides an override for the name of the deployment (e.g., the model name)"
    value: "###ZARF_PKG_TMPL_NAME_OVERRIDE###"
variables:
  # vLLM runtime configuration (usually influenced by .env in local development)
  - name: TRUST_REMOTE_CODE
    description: "If True, allows the execution of code within the model files directory"
    pattern: "^(True|False)$"
  - name: TENSOR_PARALLEL_SIZE
    description: "The number of tensor parallelism splits, typically used for model parallelism across GPUs"
    pattern: "^[1-9][0-9]*$"
  - name: ENFORCE_EAGER
    description: "If set to True, enforces eager execution mode instead of lazy execution, impacting performance"
    pattern: "^(True|False)$"
  - name: GPU_MEMORY_UTILIZATION
    description: "The fraction of GPU memory to be utilized, expressed as a decimal value between 0.01 and 0.99"
    pattern: ^0\.(0[1-9]|[1-9][0-9])$
  - name: WORKER_USE_RAY
    description: "If True, uses Ray for distributed worker management"
    pattern: "^(True|False)$"
  - name: ENGINE_USE_RAY
    description: "If True, uses Ray for managing the execution engine"
    pattern: "^(True|False)$"
  - name: QUANTIZATION
    description: "If None, allows vLLM to automatically detect the quantization method via the model files and configuration"
  - name: LOAD_FORMAT
    description: "If auto, allows vLLM to automatically detect the load format via the model files and configuration"
  # LeapfrogAI SDK runtime configuration (usually influenced by config.yaml in development)
  - name: MAX_CONTEXT_LENGTH
    description: "The maximum number of tokens the model can process in a single input before the inferencing engine's overflow strategy is used"
    pattern: "^[1-9][0-9]*$"
  - name: STOP_TOKENS
    description: "A set of special tokens that signal the model to stop producing further output, delimited using a comma and space"
    pattern: ^(<[^,]+>\s*,\s*)*<[^,]+>\s*$
  - name: PROMPT_FORMAT_CHAT_SYSTEM
    description: "Prompt template format for the LeapfrogAI SDK to consume and wrap"
  - name: PROMPT_FORMAT_CHAT_USER
    description: "Prompt template format for the LeapfrogAI SDK to consume and wrap"
  - name: PROMPT_FORMAT_CHAT_ASSISTANT
    description: "Prompt template format for the LeapfrogAI SDK to consume and wrap"
  - name: TEMPERATURE
    description: "Controls the randomness of the model's output"
    pattern: ^(0(\.\d+)?|1(\.0+)?)$
  - name: TOP_P
    description: "The cumulative probability threshold for token sampling, where 1.0 represents no restriction"
    pattern: ^(0(\.\d+)?|1(\.0+)?)$
  - name: TOP_K
    description: "The number of top-K tokens to consider during sampling, where 0 disables top-K sampling"
    pattern: ^\d+$
  - name: REPETITION_PENALTY
    description: "The penalty value for repetition in generation"
    pattern: ^(0(\.\d+)?|1(\.0+)?)$
  - name: MAX_NEW_TOKENS
    description: "Maximum new tokens to generate"
    pattern: ^\d+$
  # Pod deployment configuration
  - name: GPU_LIMIT
    description: "The GPU limit for model inferencing. Must be 1 or more."
    pattern: "^[1-9][0-9]*$"
  - name: GPU_RUNTIME
    description: "The GPU runtime name for model inferencing."
    pattern: "^(nvidia)?$"
  - name: PVC_SIZE
    description: "Size of the PVC used for model storage."
    pattern: "^[0-9]+[a-zA-Z]+$"
  - name: PVC_ACCESS_MODE
    description: "Access mode of the PVC used for model storage."
    pattern: "^(ReadWriteOnce|ReadOnlyMany|ReadWriteMany)$"
  - name: PVC_STORAGE_CLASS
    description: "Storage class of the PVC used for model storage."
components:
  - name: vllm-model
    required: true
    only:
      flavor: upstream
    charts:
      - name: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model"
        namespace: leapfrogai
        localPath: chart
        releaseName: "###ZARF_PKG_TMPL_NAME_OVERRIDE###-model"
        # x-release-please-start-version
        version: 0.14.0
        # x-release-please-end
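        # NOTE: the chart version above is presumably bumped automatically by release-please via the
        # x-release-please markers and is not expected to be edited by hand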
        valuesFiles:
          - "values/upstream-values.yaml"
    images:
      - "ghcr.io/defenseunicorns/leapfrogai/vllm:###ZARF_PKG_TMPL_IMAGE_VERSION###"
      - "cgr.dev/chainguard/bash:latest"
    dataInjections:
      # location of the locally downloaded model files
      - source: ".model/"
        target:
          namespace: "leapfrogai"
          selector: "app=lfai-###ZARF_PKG_TMPL_NAME_OVERRIDE###"
          container: "data-loader"
          # location in the container for injection of the model files
          path: "###ZARF_PKG_TMPL_MODEL_PATH###"
        compress: true
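    # NOTE: at deploy time, Zarf waits for a running pod matching the selector above and then copies
    # the contents of .model/ (produced by the onCreate action below) into the data-loader container
    # at the MODEL_PATH location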
    actions:
      onCreate:
        before:
          # NOTE: this assumes python is installed and in $PATH and that 'huggingface_hub[cli,hf_transfer]' has been installed
          - cmd: "python src/model_download.py"
            env:
              - LFAI_REPO_ID=###ZARF_PKG_TMPL_MODEL_REPO_ID###
              - LFAI_REVISION=###ZARF_PKG_TMPL_MODEL_REVISION###
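          # For example (illustrative commands, not part of the package), satisfying the prerequisite
          # and building the package locally might look like:
          #   pip install 'huggingface_hub[cli,hf_transfer]'
          #   zarf package create . --set MODEL_REPO_ID=<org/model> --set MODEL_REVISION=<branch-or-commit> --confirm
          # (the remaining ###ZARF_PKG_TMPL_*### values can typically be set the same way or via a zarf-config file)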