Commit cba8b3d

Parametrize DRA test config
1 parent d57e351 commit cba8b3d

4 files changed: +47 -22 lines changed

clusterloader2/pkg/dependency/dra/dra.go

Lines changed: 0 additions & 6 deletions
@@ -167,12 +167,6 @@ func (d *draDependency) isDRADriverReady(config *dependency.Config, daemonsetNam
 }
 
 func isResourceSlicesPublished(config *dependency.Config, namespace string) (bool, error) {
-	// Get a list of all nodes
-	// nodes, err := getReadyNodesCount(config)
-	// if err != nil {
-	// 	return false, fmt.Errorf("failed to list nodes: %v", err)
-	// }
-
 	driverPluginPods, err := getDriverPluginPods(config, namespace, draDaemonsetName)
 	if err != nil {
 		return false, fmt.Errorf("failed to list driverPluginPods: %v", err)

clusterloader2/testing/dra/config.yaml

Lines changed: 43 additions & 14 deletions
@@ -2,12 +2,28 @@
 {{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
 {{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
 {{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
+{{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
+{{$UPSIZE_THRESHOLD := DefaultParam .CL2_UPSIZE_THRESHOLD "5s"}}
+{{$UPSIZE_PERC50_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC50_THRESHOLD "5s"}}
+{{$UPSIZE_PERC90_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC90_THRESHOLD "5s"}}
+{{$CHURN_POD_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC50_THRESHOLD "5s"}}
+{{$CHURN_POD_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC90_THRESHOLD "5s"}}
+{{$CHURN_POD_STARTUP_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD "5s"}}
+{{$FINISHED_JOBS_THRESHOLD := DefaultParam .CL2_FINISHED_JOBS_THRESHOLD "10m"}}
+{{$RUNNING_JOBS_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_THRESHOLD "10m"}}
+{{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}}
 {{$token := .CL2_TOKEN }}
 
 {{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
 
+# dra
+{{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "dra-example-driver"}}
+{{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}}
+
 # Node resource configuration
 {{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
+{{$resourceSlicesPerNode := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
+{{$totalResourceSliceCount := MultiplyInt $resourceSlicesPerNode .Nodes}}
 {{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}
 
 # fast fill job configuration - for initial fill up
@@ -19,11 +35,14 @@
 
 # churn job configuration for steady state
 {{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
-{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
+{{$calculatedSJPN := DivideInt $smallJobPodsCount $namespaces}}
+{{$maxSJPN := DefaultParam .CL2_MAX_SMALL_JOBS_PER_NAMESPACE 999999}}
+{{$smallJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}}
 {{$smallJobSize := 1}}
-{{$smallJobCompletions := 10}}
+{{$smallJobCompletions := DefaultParam .CL2_SMALL_JOB_COMPLETIONS 10}}
 {{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
 {{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}}
+{{$deviceClassName := DefaultParam .CL2_DEVICE_CLASS_NAME "gpu.example.com"}}
 
 {{$extendedResourceName := ""}}
 {{if $ENABLE_EXTENDED_RESOURCES}}
@@ -46,10 +65,12 @@ tuningSets:
 
 
 dependencies:
-- name: Install dra-example-driver for test
+- name: Install dra-driver for test
  Method: DRATestDriver
  Params:
-    WorkerNodeCount: {{.Nodes}}
+    WorkerNodeCount: {{$totalResourceSliceCount}}
+    DaemonsetName: {{$draDaemonsetName}}
+    Manifests: {{$draManifests}}
  {{if $ENABLE_EXTENDED_RESOURCES}}
    ExtendedResourceName: {{$extendedResourceName}}
  {{end}}
@@ -70,12 +91,15 @@ steps:
       apiVersion: batch/v1
       kind: Job
       labelSelector: job-type = long-running
-      operationTimeout: 120s
+      operationTimeout: {{$RUNNING_JOBS_OPERATION_THRESHOLD}}
   - Identifier: FastFillPodStartupLatency
     Method: PodStartupLatency
     Params:
       action: start
       labelSelector: job-type = long-running
+      perc50Threshold: {{$UPSIZE_PERC50_THRESHOLD}}
+      perc90Threshold: {{$UPSIZE_PERC90_THRESHOLD}}
+      threshold: {{$UPSIZE_THRESHOLD}}
   - Identifier: FastFillClaimAllocationLatency
     Method: ResourceClaimAllocationLatency
     Params:
@@ -111,6 +135,8 @@ steps:
     objectBundle:
     - basename: single-gpu
       objectTemplatePath: "resourceclaimtemplate.yaml"
+      templateFillMap:
+        DeviceClassName: {{$deviceClassName}}
 {{end}}
 - name: Fill cluster to {{$fillPercentage}}% utilization
   phases:
@@ -134,7 +160,7 @@ steps:
     Params:
       action: gather
       labelSelector: job-type = long-running
-      timeout: 15m
+      timeout: {{$RUNNING_JOBS_THRESHOLD}}
 - name: Gather measurements for long running pods
   measurements:
   - Identifier: FastFillSchedulingMetrics
@@ -145,6 +171,9 @@ steps:
     Method: PodStartupLatency
     Params:
       action: gather
+      perc50Threshold: {{$UPSIZE_PERC50_THRESHOLD}}
+      perc90Threshold: {{$UPSIZE_PERC90_THRESHOLD}}
+      threshold: {{$UPSIZE_THRESHOLD}}
   - Identifier: FastFillClaimAllocationLatency
     Method: ResourceClaimAllocationLatency
     Params:
@@ -164,9 +193,9 @@ steps:
     Params:
       action: start
       labelSelector: job-type = short-lived
-      perc50Threshold: 40s
-      perc90Threshold: 60s
-      perc99Threshold: 80s
+      perc50Threshold: {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
+      perc90Threshold: {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
+      perc99Threshold: {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
   - Identifier: ChurnClaimAllocationLatency
     Method: ResourceClaimAllocationLatency
     Params:
@@ -210,7 +239,7 @@ steps:
     Params:
       action: gather
       labelSelector: job-type = short-lived
-      timeout: 15m
+      timeout: {{$FINISHED_JOBS_THRESHOLD}}
 - name: Measure scheduler metrics
   measurements:
   - Identifier: ChurnSchedulingMetrics
@@ -221,14 +250,14 @@ steps:
     Method: PodStartupLatency
     Params:
       action: gather
-      perc50Threshold: 40s
-      perc90Threshold: 60s
-      perc99Threshold: 80s
+      perc50Threshold: {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
+      perc90Threshold: {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
+      perc99Threshold: {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
   - Identifier: ChurnClaimAllocationLatency
     Method: ResourceClaimAllocationLatency
     Params:
       action: gather
   - Identifier: ChurnDRAMetrics
     Method: GenericPrometheusQuery
     Params:
-      action: gather
+      action: gather
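
With the new CL2_* parameters above, the same test can be pointed at a different DRA driver without editing config.yaml. Below is a minimal sketch of an overrides file, typically passed to clusterloader2 via --testoverrides; the driver name, device class, and values are hypothetical, and only the parameter keys come from this commit:

# overrides.yaml (illustrative values only)
CL2_DRA_MANIFESTS: my-dra-driver                      # hypothetical manifests bundle
CL2_DRA_DAEMONSET_NAME: my-dra-driver-kubeletplugin   # hypothetical kubelet plugin DaemonSet
CL2_DEVICE_CLASS_NAME: gpu.my-driver.example.com      # hypothetical DeviceClass name
CL2_RESOURCE_SLICES_PER_NODE: 2
CL2_SMALL_JOB_COMPLETIONS: 5
CL2_MAX_SMALL_JOBS_PER_NAMESPACE: 50
CL2_UPSIZE_THRESHOLD: 10s
CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD: 30s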

clusterloader2/testing/dra/job.yaml

Lines changed: 3 additions & 1 deletion
@@ -9,7 +9,9 @@ spec:
   parallelism: {{.Replicas}}
   completions: {{.CompletionReplicas}}
   completionMode: {{.Mode}}
-  ttlSecondsAfterFinished: 300
+  # In tests involving a large number of sequentially created, short-lived jobs, the spin-up time may be significant.
+  # A TTL of 1 hour should be sufficient to retain the jobs long enough for measurement checks.
+  ttlSecondsAfterFinished: 3600 # 1 hour
   template:
     metadata:
       labels:
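
As a rough check on the new TTL, here is a back-of-the-envelope sketch using this config's defaults, assuming each short-lived job runs its completions sequentially at parallelism 1; actual timing also depends on scheduling latency and churn throughput:

# jobRunningTime = 30s, smallJobCompletions = 10, smallJobSize (parallelism) = 1
#   minimum lifetime of one short-lived job: 10 * 30s = 300s
# The old TTL of 300s matched that minimum, so jobs that finished early in the
# churn phase could be garbage-collected before the gather steps complete;
# 3600s leaves roughly 55 minutes of slack for sequential job creation and
# measurement gathering.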

clusterloader2/testing/dra/resourceclaimtemplate.yaml

Lines changed: 1 addition & 1 deletion
@@ -8,4 +8,4 @@ spec:
       requests:
       - name: gpu
         exactly:
-          deviceClassName: gpu.example.com
+          deviceClassName: {{.DeviceClassName}}
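
The template value is filled from the templateFillMap added in config.yaml (DeviceClassName: {{$deviceClassName}}, defaulting to gpu.example.com). A sketch of how this hunk renders when CL2_DEVICE_CLASS_NAME is left at its default; only the lines shown above come from the source file:

      requests:
      - name: gpu
        exactly:
          deviceClassName: gpu.example.com   # value of CL2_DEVICE_CLASS_NAME (default)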
