22{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
33{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
44{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
5+ {{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
6+ {{$UPSIZE_THRESHOLD := DefaultParam .CL2_UPSIZE_THRESHOLD "10m"}}
7+ {{$UPSIZE_PERC50_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC50_THRESHOLD "40s"}}
8+ {{$UPSIZE_PERC90_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC90_THRESHOLD "4m"}}
9+ {{$CHURN_POD_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC50_THRESHOLD "40s"}}
10+ {{$CHURN_POD_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC90_THRESHOLD "60s"}}
11+ {{$CHURN_POD_STARTUP_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD "80s"}}
12+ {{$FINISHED_JOBS_THRESHOLD := DefaultParam .CL2_FINISHED_JOBS_THRESHOLD "10m"}}
13+ {{$RUNNING_JOBS_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_THRESHOLD "10m"}}
14+ {{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}}
515{{$token := .CL2_TOKEN }}
616
717{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
818
19+ # dra
20+ {{$draNamespace := DefaultParam .CL2_DRA_NAMESPACE "dra-example-driver"}}
21+ {{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "example"}}
22+ {{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}}
23+
924# Node resource configuration
1025{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
26+ {{$resourceSlicesPerNode := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
27+ {{$workerNodeCount := MultiplyInt $resourceSlicesPerNode .Nodes}}
1128{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}
1229
1330# fast fill job configuration - for initial fill up
1936
2037# churn job configuration for steady state
2138{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
22- {{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
39+ {{$calculatedSJPN := DivideInt $smallJobPodsCount $namespaces}}
40+ {{$maxSJPN := DefaultParam .CL2_MAX_SMALL_JOBS_PER_NAMESPACE 999999}}
41+ {{$smallJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}}
2342{{$smallJobSize := 1}}
24- {{$smallJobCompletions := 10}}
43+ {{$smallJobCompletions := DefaultParam .CL2_SMALL_JOB_COMPLETIONS 10}}
2544{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
2645
2746name : dra-steady-state
@@ -38,10 +57,13 @@ tuningSets:
3857 qps : {{$STEADY_STATE_QPS}}
3958
4059dependencies :
41- - name : Install dra-example- driver for test
60+ - name : Install dra-driver for test
4261 Method : DRATestDriver
4362 Params :
44- WorkerNodeCount : {{.Nodes}}
63+ WorkerNodeCount : {{$workerNodeCount}}
64+ Namespace : {{$draNamespace}}
65+ DaemonsetName : {{$draDaemonsetName}}
66+ Manifests : {{$draManifests}}
4567 Timeout : 5m
4668
4769steps :
@@ -59,12 +81,15 @@ steps:
5981 apiVersion : batch/v1
6082 kind : Job
6183 labelSelector : job-type = long-running
62- operationTimeout : 120s
84+ operationTimeout : {{$RUNNING_JOBS_OPERATION_THRESHOLD}}
6385 - Identifier : FastFillPodStartupLatency
6486 Method : PodStartupLatency
6587 Params :
6688 action : start
6789 labelSelector : job-type = long-running
90+ perc50Threshold : {{$UPSIZE_PERC50_THRESHOLD}}
91+ perc90Threshold : {{$UPSIZE_PERC90_THRESHOLD}}
92+ threshold : {{$UPSIZE_THRESHOLD}}
6893 - Identifier : FastFillClaimAllocationLatency
6994 Method : ResourceClaimAllocationLatency
7095 Params :
@@ -98,7 +123,10 @@ steps:
98123 tuningSet : FastFill
99124 objectBundle :
100125 - basename : single-gpu
126+ # TODO: add resourceclaimtemplates for other drivers; objectTemplatePath below is only emitted when $draManifests is "example".
127+ {{if eq $draManifests "example"}}
101128 objectTemplatePath : " resourceclaimtemplate.yaml"
129+ {{end}}
102130- name : Fill cluster to {{$fillPercentage}}% utilization
103131 phases :
104132 - namespaceRange :
@@ -120,7 +148,7 @@ steps:
120148 Params :
121149 action : gather
122150 labelSelector : job-type = long-running
123- timeout : 15m
151+ timeout : {{$RUNNING_JOBS_THRESHOLD}}
124152- name : Gather measurements for long running pods
125153 measurements :
126154 - Identifier : FastFillSchedulingMetrics
@@ -131,6 +159,9 @@ steps:
131159 Method : PodStartupLatency
132160 Params :
133161 action : gather
162+ perc50Threshold : {{$UPSIZE_PERC50_THRESHOLD}}
163+ perc90Threshold : {{$UPSIZE_PERC90_THRESHOLD}}
164+ threshold : {{$UPSIZE_THRESHOLD}}
134165 - Identifier : FastFillClaimAllocationLatency
135166 Method : ResourceClaimAllocationLatency
136167 Params :
@@ -150,9 +181,9 @@ steps:
150181 Params :
151182 action : start
152183 labelSelector : job-type = short-lived
153- perc50Threshold : 40s
154- perc90Threshold : 60s
155- perc99Threshold : 80s
184+ perc50Threshold : {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
185+ perc90Threshold : {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
186+ perc99Threshold : {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
156187 - Identifier : ChurnClaimAllocationLatency
157188 Method : ResourceClaimAllocationLatency
158189 Params :
@@ -195,7 +226,7 @@ steps:
195226 Params :
196227 action : gather
197228 labelSelector : job-type = short-lived
198- timeout : 15m
229+ timeout : {{$FINISHED_JOBS_THRESHOLD}}
199230- name : Measure scheduler metrics
200231 measurements :
201232 - Identifier : ChurnSchedulingMetrics
@@ -206,14 +237,14 @@ steps:
206237 Method : PodStartupLatency
207238 Params :
208239 action : gather
209- perc50Threshold : 40s
210- perc90Threshold : 60s
211- perc99Threshold : 80s
240+ perc50Threshold : {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
241+ perc90Threshold : {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
242+ perc99Threshold : {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
212243 - Identifier : ChurnClaimAllocationLatency
213244 Method : ResourceClaimAllocationLatency
214245 Params :
215246 action : gather
216247 - Identifier : ChurnDRAMetrics
217248 Method : GenericPrometheusQuery
218249 Params :
219- action : gather
250+ action : gather
0 commit comments