-
Notifications
You must be signed in to change notification settings - Fork 773
Expand file tree
/
Copy pathvalues.yaml
More file actions
312 lines (271 loc) · 11.8 KB
/
values.yaml
File metadata and controls
312 lines (271 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# Default values for kuberay-operator.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
# -- String to partially override release name.
nameOverride: kuberay-operator

# -- String to fully override release name.
fullnameOverride: kuberay-operator

# -- String to override component name.
componentOverride: kuberay-operator

# -- Number of replicas for the KubeRay operator Deployment.
# When running more than one replica, see `leaderElectionEnabled` further down in this file.
replicas: 1
# Container image settings for the operator.
image:
  # -- Image repository.
  repository: quay.io/kuberay/operator
  # -- Image tag.
  tag: nightly
  # -- Image pull policy.
  pullPolicy: IfNotPresent
# -- Secrets with credentials to pull images from a private registry.
imagePullSecrets: []

# -- Restrict to run on particular nodes.
nodeSelector: {}

# -- Pod priorityClassName.
priorityClassName: ""

# -- Extra labels.
labels: {}

# -- Extra annotations.
annotations: {}

# -- Pod affinity.
affinity: {}

# -- Pod tolerations.
tolerations: []
# Service account settings for the operator.
serviceAccount:
  # -- Specifies whether a service account should be created.
  create: true
  # -- The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template.
  name: kuberay-operator
# Logging settings for kuberay-operator (stdout and optional log file).
logging:
  # -- Log encoder to use for stdout (one of `json` or `console`).
  stdoutEncoder: json
  # -- Log encoder to use for file logging (one of `json` or `console`).
  fileEncoder: json
  # -- Directory for kuberay-operator log file.
  baseDir: ""
  # -- File name for kuberay-operator log file.
  fileName: ""
  # -- EmptyDir volume size limit for kuberay-operator log file.
  sizeLimit: ""
# Enable customized Kubernetes scheduler integration. If enabled, Ray workloads will be scheduled
# by the customized scheduler.
#  * "enabled" is the legacy option and will be deprecated soon.
#  * "name" is the standard option, expecting a scheduler name, supported values are
#    "default", "volcano", "yunikorn", "kai-scheduler", and "scheduler-plugins".
#
# Note: "enabled" and "name" should not be set at the same time. If both are set, an error will be thrown.
#
# Examples:
#  1. Use volcano (deprecated)
#       batchScheduler:
#         enabled: true
#
#  2. Use volcano
#       batchScheduler:
#         name: volcano
#
#  3. Use yunikorn
#       batchScheduler:
#         name: yunikorn
#
#  4. Use PodGroup
#       batchScheduler:
#         name: scheduler-plugins
#
#  5. Use Kai Scheduler
#       batchScheduler:
#         name: kai-scheduler
batchScheduler:
  # Deprecated. This option will be removed in the future.
  # Note: kept for backwards compatibility. When set to true, it enables volcano scheduler integration.
  enabled: false
  # Set the customized scheduler name, supported values are "volcano", "yunikorn", "kai-scheduler" or "scheduler-plugins", do not set
  # "batchScheduler.enabled=true" at the same time as it will override this option.
  name: ""
# Configuration for the KubeRay operator.
configuration:
  # -- Whether to enable the configuration feature. If enabled, a ConfigMap will be created and mounted to the operator.
  # When enabled, flag-based configuration values (leaderElectionEnabled, metrics.enabled, kubeClient.qps, etc.)
  # will be injected into the ConfigMap. The operator will use the ConfigMap and ignore command-line flags.
  enabled: false
  # -- Default environment variables to inject into all Ray containers in all RayCluster CRs.
  # This allows users to set feature flags across all Ray pods.
  # Example:
  # defaultContainerEnvs:
  #   - name: RAY_enable_open_telemetry
  #     value: "true"
  #   - name: RAY_metric_cardinality_level
  #     value: "recommended"
  defaultContainerEnvs: []
  # -- Sidecar containers to inject into every Ray head pod.
  # Example:
  # headSidecarContainers:
  #   - name: fluentbit
  #     image: fluent/fluent-bit:1.9
  headSidecarContainers: []
  # -- Sidecar containers to inject into every Ray worker pod.
  # Example:
  # workerSidecarContainers:
  #   - name: fluentbit
  #     image: fluent/fluent-bit:1.9
  workerSidecarContainers: []
# Feature gates: each entry names a gate and states whether it is enabled.
featureGates:
  - name: RayClusterStatusConditions
    enabled: true
  - name: RayJobDeletionPolicy
    enabled: true
  - name: RayMultiHostIndexing
    enabled: true
  - name: RayServiceIncrementalUpgrade
    enabled: false
  - name: RayCronJob
    enabled: false
# Configurations for KubeRay operator metrics.
metrics:
  # -- Whether KubeRay operator should emit control plane metrics.
  enabled: true
  serviceMonitor:
    # -- Enable a prometheus ServiceMonitor
    enabled: false
    # -- Prometheus ServiceMonitor interval
    interval: 30s
    # -- When true, honorLabels preserves the metric's labels when they collide with the target's labels.
    honorLabels: true
    # -- Prometheus ServiceMonitor selector
    selector: {}
    #   release: prometheus
    # -- Prometheus ServiceMonitor namespace
    namespace: ""  # "monitoring"
# -- Path to the operator binary
operatorCommand: /manager

# If useKubernetesProxy is set to true, the KubeRay operator will be configured with the --use-kubernetes-proxy flag.
# Use this option to configure kuberay-operator to communicate with Ray head pods by proxying through the Kubernetes API Server.
# useKubernetesProxy: true

# -- If leaderElectionEnabled is set to true, the KubeRay operator will use leader election for high availability.
leaderElectionEnabled: true

# -- The maximum number of reconcile operations that can be performed simultaneously.
# This setting controls the concurrency of the controller reconciliation loops.
# Higher values can improve throughput in clusters with many resources, but may increase resource consumption.
reconcileConcurrency: 1
# -- Kube Client configuration for QPS and burst settings.
# This setting controls the QPS and burst rate of the kube client when sending requests to the Kubernetes API server.
# If the QPS and burst values are too low, we may easily hit rate limits on the API server and slow down the controller reconciliation loops.
kubeClient:
  # -- The QPS value for the client communicating with the Kubernetes API server.
  # Must be a float number.
  qps: 100.0
  # -- The maximum burst for throttling requests from this client to the Kubernetes API server.
  # Must be a non-negative integer.
  burst: 200
# -- If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on.
rbacEnable: true

# -- When crNamespacedRbacEnable is set to true, the KubeRay operator will create a Role for RayCluster preparation (e.g., Pods, Services)
# and a corresponding RoleBinding for each namespace listed in the "watchNamespace" parameter. Please note that even if crNamespacedRbacEnable
# is set to false, the Role and RoleBinding for leader election will still be created.
#
# Note:
# (1) This variable is only effective when rbacEnable and singleNamespaceInstall are both set to true.
# (2) In most cases, it should be set to true, unless you are using a Kubernetes cluster managed by GitOps tools such as ArgoCD.
crNamespacedRbacEnable: true

# -- When singleNamespaceInstall is true:
# - Install namespaced RBAC resources such as Role and RoleBinding instead of cluster-scoped ones like ClusterRole and ClusterRoleBinding so that
#   the chart can be installed by users with permissions restricted to a single namespace.
#   (Please note that this excludes the CRDs, which can only be installed at the cluster scope.)
# - If "watchNamespace" is not set, the KubeRay operator will, by default, only listen
#   to resource events within its own namespace.
singleNamespaceInstall: false

# The KubeRay operator will watch the custom resources in the namespaces listed in the "watchNamespace" parameter.
# watchNamespace:
#   - n1
#   - n2
# -- Environment variables.
# Empty (null) by default: every entry below is a commented-out example.
env:
# If not set or set to true, kuberay auto injects an init container waiting for ray GCS.
# If false, you will need to inject your own init container to ensure ray GCS is up before the ray workers start.
# Warning: we highly recommend setting this to true and letting kuberay handle it for you.
# - name: ENABLE_INIT_CONTAINER_INJECTION
#   value: "true"
# If set to true, kuberay creates a normal ClusterIP service for a Ray Head instead of a Headless service. Default to false.
# - name: ENABLE_RAY_HEAD_CLUSTER_IP_SERVICE
#   value: "false"
# If not set or set to "", kuberay will pick up the default k8s cluster domain `cluster.local`
# Otherwise, kuberay will use your custom domain
# - name: CLUSTER_DOMAIN
#   value: ""
# If not set or set to false, when running on OpenShift with Ingress creation enabled, kuberay will create OpenShift route
# Otherwise, regardless of the type of cluster with Ingress creation enabled, kuberay will create Ingress
# - name: USE_INGRESS_ON_OPENSHIFT
#   value: "true"
# Unconditionally requeue after the number of seconds specified in the
# environment variable RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV. If the
# environment variable is not set, requeue after the default value (300).
# Environment variable values must be strings, so the number is quoted.
# - name: RAYCLUSTER_DEFAULT_REQUEUE_SECONDS_ENV
#   value: "300"
# If not set or set to "true", KubeRay will clean up the Redis storage namespace when a GCS FT-enabled RayCluster is deleted.
# - name: ENABLE_GCS_FT_REDIS_CLEANUP
#   value: "true"
# For LLM serving, some users might not have sufficient GPU resources to run two RayClusters simultaneously.
# Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
# - name: ENABLE_ZERO_DOWNTIME
#   value: "true"
# This environment variable for the KubeRay operator is used to determine whether to enable
# the injection of readiness and liveness probes into Ray head and worker containers.
# Enabling this feature contributes to the robustness of Ray clusters.
# - name: ENABLE_PROBES_INJECTION
#   value: "true"
# If set to true, the RayJob CR itself will be deleted if shutdownAfterJobFinishes is set to true. Note that all resources created by the RayJob CR will be deleted, including the K8s Job. Otherwise, only the RayCluster CR will be deleted. Default is false.
# - name: DELETE_RAYJOB_CR_AFTER_JOB_FINISHES
#   value: "false"
# If set to true, we will use deterministic name for head pod. Otherwise, the non-deterministic name is used.
# - name: ENABLE_DETERMINISTIC_HEAD_POD_NAME
#   value: "false"
# This environment variable determines whether to enable a login shell by passing the -l option to the container command /bin/bash.
# The -l flag was added by default before KubeRay v1.4.0, but it is no longer added by default starting with v1.4.0.
# - name: ENABLE_LOGIN_SHELL
#   value: "true"
# This KubeRay operator environment variable is used to determine if random Pod
# deletion should be enabled. Note that this only takes effect when autoscaling
# is enabled for the RayCluster.
# - name: ENABLE_RANDOM_POD_DELETE
#   value: "false"
# If JobDeploymentStatus does not transition to Complete or Failed within
# this grace period seconds after JobStatus reaches a terminal state,
# KubeRay will update JobDeploymentStatus directly.
# - name: RAYJOB_DEPLOYMENT_STATUS_TRANSITION_GRACE_PERIOD_SECONDS
#   value: "300"
# -- Resource requests and limits for containers.
resources:
  limits:
    cpu: 100m
    # Anecdotally, managing 500 Ray pods requires roughly 500MB memory.
    # Monitor memory usage and adjust as needed.
    memory: 512Mi
  # requests:
  #   cpu: 100m
  #   memory: 512Mi
# @ignore -- Pod liveness probe configuration.
livenessProbe:
  initialDelaySeconds: 10
  periodSeconds: 5
  failureThreshold: 5
# @ignore -- Pod readiness probe configuration.
readinessProbe:
  initialDelaySeconds: 10
  periodSeconds: 5
  failureThreshold: 5
# -- Set up `securityContext` to improve Pod security.
podSecurityContext: {}

# @ignore -- Set up `securityContext` to improve container security.
securityContext:
  allowPrivilegeEscalation: false
  readOnlyRootFilesystem: true
  capabilities:
    drop:
      - ALL
  runAsNonRoot: true
  seccompProfile:
    type: RuntimeDefault
# Service settings for the operator.
service:
  # -- Service type.
  type: ClusterIP
  # -- Service port.
  port: 8080