# Tunable test parameters. Each value below is read from the matching CL2_*
# override through DefaultParam, with the literal shown as the fallback.
# $NODES_PER_NAMESPACE is additionally capped at .Nodes via MinInt.
22{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
33{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
44{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
# NOTE(review): CL2_RESOURCE_SLICES_PER_NODE is read a second time further down
# into $resourceSlicesPerNode - confirm both variables are really needed.
5+ {{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
# Pod startup latency thresholds applied to the fast-fill measurement
# (pods selected by job-type = long-running).
6+ {{$UPSIZE_THRESHOLD := DefaultParam .CL2_UPSIZE_THRESHOLD "5s"}}
7+ {{$UPSIZE_PERC50_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC50_THRESHOLD "5s"}}
8+ {{$UPSIZE_PERC90_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC90_THRESHOLD "5s"}}
# Pod startup latency thresholds applied to the churn measurement
# (pods selected by job-type = short-lived).
# NOTE(review): the hard-coded values these replace were 40s / 60s / 80s; the
# "5s" fallbacks are far stricter - confirm this tightening is intended.
9+ {{$CHURN_POD_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC50_THRESHOLD "5s"}}
10+ {{$CHURN_POD_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC90_THRESHOLD "5s"}}
11+ {{$CHURN_POD_STARTUP_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD "5s"}}
# Gather timeouts for the short-lived (finished) and long-running (running)
# job waits, plus the operationTimeout of the long-running job wait.
# NOTE(review): the two gather timeouts these replace were hard-coded to 15m,
# while the fallback here is "10m" - confirm the shorter default is intended.
12+ {{$FINISHED_JOBS_THRESHOLD := DefaultParam .CL2_FINISHED_JOBS_THRESHOLD "10m"}}
13+ {{$RUNNING_JOBS_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_THRESHOLD "10m"}}
14+ {{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}}
# Taken directly from .CL2_TOKEN; no fallback value is supplied here.
515{{$token := .CL2_TOKEN }}
616
717{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
818
19+ # DRA driver installation settings (manifests and daemonset name)
20+ {{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "dra-example-driver"}}
21+ {{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}}
22+
923# Node resource configuration
1024{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
25+ {{$resourceSlicesPerNode := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
26+ {{$totalResourceSliceCount := MultiplyInt $resourceSlicesPerNode .Nodes}}
1127{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}
1228
1329# fast fill job configuration - for initial fill up
1935
2036# churn job configuration for steady state
# Pods available to small jobs: total GPUs minus the fast-fill pods
# (fill pods per namespace multiplied by the number of namespaces).
2137{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
22- {{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
# Small jobs per namespace: the even per-namespace share, capped by
# CL2_MAX_SMALL_JOBS_PER_NAMESPACE (fallback 999999 when no override is set).
38+ {{$calculatedSJPN := DivideInt $smallJobPodsCount $namespaces}}
39+ {{$maxSJPN := DefaultParam .CL2_MAX_SMALL_JOBS_PER_NAMESPACE 999999}}
40+ {{$smallJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}}
2341{{$smallJobSize := 1}}
24- {{$smallJobCompletions := 10}}
# Completions per small job; overridable, with the former constant 10 as fallback.
42+ {{$smallJobCompletions := DefaultParam .CL2_SMALL_JOB_COMPLETIONS 10}}
2543{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
2644{{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}}
# Passed to resourceclaimtemplate.yaml as DeviceClassName via templateFillMap.
45+ {{$deviceClassName := DefaultParam .CL2_DEVICE_CLASS_NAME "gpu.example.com"}}
2746
2847{{$extendedResourceName := ""}}
2948{{if $ENABLE_EXTENDED_RESOURCES}}
@@ -46,10 +65,12 @@ tuningSets:
4665
4766
4867dependencies :
49- - name : Install dra-example- driver for test
68+ - name : Install dra-driver for test
5069 Method : DRATestDriver
5170 Params :
52- WorkerNodeCount : {{.Nodes}}
71+ WorkerNodeCount : {{$totalResourceSliceCount}}
72+ DaemonsetName : {{$draDaemonsetName}}
73+ Manifests : {{$draManifests}}
5374 {{if $ENABLE_EXTENDED_RESOURCES}}
5475 ExtendedResourceName : {{$extendedResourceName}}
5576 {{end}}
@@ -70,12 +91,15 @@ steps:
7091 apiVersion : batch/v1
7192 kind : Job
7293 labelSelector : job-type = long-running
73- operationTimeout : 120s
94+ operationTimeout : {{$RUNNING_JOBS_OPERATION_THRESHOLD}}
7495 - Identifier : FastFillPodStartupLatency
7596 Method : PodStartupLatency
7697 Params :
7798 action : start
7899 labelSelector : job-type = long-running
100+ perc50Threshold : {{$UPSIZE_PERC50_THRESHOLD}}
101+ perc90Threshold : {{$UPSIZE_PERC90_THRESHOLD}}
102+ threshold : {{$UPSIZE_THRESHOLD}}
79103 - Identifier : FastFillClaimAllocationLatency
80104 Method : ResourceClaimAllocationLatency
81105 Params :
@@ -111,6 +135,8 @@ steps:
111135 objectBundle :
112136 - basename : single-gpu
113137 objectTemplatePath : " resourceclaimtemplate.yaml"
138+ templateFillMap :
139+ DeviceClassName : {{$deviceClassName}}
114140{{end}}
115141- name : Fill cluster to {{$fillPercentage}}% utilization
116142 phases :
@@ -134,7 +160,7 @@ steps:
134160 Params :
135161 action : gather
136162 labelSelector : job-type = long-running
137- timeout : 15m
163+ timeout : {{$RUNNING_JOBS_THRESHOLD}}
138164- name : Gather measurements for long running pods
139165 measurements :
140166 - Identifier : FastFillSchedulingMetrics
@@ -145,6 +171,9 @@ steps:
145171 Method : PodStartupLatency
146172 Params :
147173 action : gather
174+ perc50Threshold : {{$UPSIZE_PERC50_THRESHOLD}}
175+ perc90Threshold : {{$UPSIZE_PERC90_THRESHOLD}}
176+ threshold : {{$UPSIZE_THRESHOLD}}
148177 - Identifier : FastFillClaimAllocationLatency
149178 Method : ResourceClaimAllocationLatency
150179 Params :
@@ -164,9 +193,9 @@ steps:
164193 Params :
165194 action : start
166195 labelSelector : job-type = short-lived
167- perc50Threshold : 40s
168- perc90Threshold : 60s
169- perc99Threshold : 80s
196+ perc50Threshold : {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
197+ perc90Threshold : {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
198+ perc99Threshold : {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
170199 - Identifier : ChurnClaimAllocationLatency
171200 Method : ResourceClaimAllocationLatency
172201 Params :
@@ -210,7 +239,7 @@ steps:
210239 Params :
211240 action : gather
212241 labelSelector : job-type = short-lived
213- timeout : 15m
242+ timeout : {{$FINISHED_JOBS_THRESHOLD}}
214243- name : Measure scheduler metrics
215244 measurements :
216245 - Identifier : ChurnSchedulingMetrics
@@ -221,14 +250,14 @@ steps:
221250 Method : PodStartupLatency
222251 Params :
223252 action : gather
224- perc50Threshold : 40s
225- perc90Threshold : 60s
226- perc99Threshold : 80s
253+ perc50Threshold : {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
254+ perc90Threshold : {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
255+ perc99Threshold : {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
227256 - Identifier : ChurnClaimAllocationLatency
228257 Method : ResourceClaimAllocationLatency
229258 Params :
230259 action : gather
231260 - Identifier : ChurnDRAMetrics
232261 Method : GenericPrometheusQuery
233262 Params :
234- action : gather
263+ action : gather
0 commit comments