Skip to content

Commit 65b6445

Browse files
committed
Parametrize DRA test config
1 parent c51aeb1 commit 65b6445

File tree

2 files changed

+48
-15
lines changed

2 files changed

+48
-15
lines changed

clusterloader2/testing/dra/config.yaml

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,29 @@
22
{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
33
{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
44
{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
5+
{{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
6+
{{$UPSIZE_THRESHOLD := DefaultParam .CL2_UPSIZE_THRESHOLD "10m"}}
7+
{{$UPSIZE_PERC50_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC50_THRESHOLD "40s"}}
8+
{{$UPSIZE_PERC90_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC90_THRESHOLD "4m"}}
9+
{{$CHURN_POD_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC50_THRESHOLD "40s"}}
10+
{{$CHURN_POD_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC90_THRESHOLD "60s"}}
11+
{{$CHURN_POD_STARTUP_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD "80s"}}
12+
{{$FINISHED_JOBS_THRESHOLD := DefaultParam .CL2_FINISHED_JOBS_THRESHOLD "10m"}}
13+
{{$RUNNING_JOBS_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_THRESHOLD "10m"}}
14+
{{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}}
515
{{$token := .CL2_TOKEN }}
616

717
{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
818

19+
# DRA driver configuration
20+
{{$draNamespace := DefaultParam .CL2_DRA_NAMESPACE "dra-example-driver"}}
21+
{{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "example"}}
22+
{{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}}
23+
924
# Node resource configuration
1025
{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
26+
{{$resourceSlicesPerNode := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
27+
{{$workerNodeCount := MultiplyInt $resourceSlicesPerNode .Nodes}}
1128
{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}
1229

1330
# fast fill job configuration - for initial fill up
@@ -19,9 +36,11 @@
1936

2037
# churn job configuration for steady state
2138
{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
22-
{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
39+
{{$calculatedSJPN := DivideInt $smallJobPodsCount $namespaces}}
40+
{{$maxSJPN := DefaultParam .CL2_MAX_SMALL_JOBS_PER_NAMESPACE 999999}}
41+
{{$smallJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}}
2342
{{$smallJobSize := 1}}
24-
{{$smallJobCompletions := 10}}
43+
{{$smallJobCompletions := DefaultParam .CL2_SMALL_JOB_COMPLETIONS 10}}
2544
{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
2645

2746
name: dra-steady-state
@@ -38,10 +57,13 @@ tuningSets:
3857
qps: {{$STEADY_STATE_QPS}}
3958

4059
dependencies:
41-
- name: Install dra-example-driver for test
60+
- name: Install dra-driver for test
4261
Method: DRATestDriver
4362
Params:
44-
WorkerNodeCount: {{.Nodes}}
63+
WorkerNodeCount: {{$workerNodeCount}}
64+
Namespace: {{$draNamespace}}
65+
DaemonsetName: {{$draDaemonsetName}}
66+
Manifests: {{$draManifests}}
4567
Timeout: 5m
4668

4769
steps:
@@ -59,12 +81,15 @@ steps:
5981
apiVersion: batch/v1
6082
kind: Job
6183
labelSelector: job-type = long-running
62-
operationTimeout: 120s
84+
operationTimeout: {{$RUNNING_JOBS_OPERATION_THRESHOLD}}
6385
- Identifier: FastFillPodStartupLatency
6486
Method: PodStartupLatency
6587
Params:
6688
action: start
6789
labelSelector: job-type = long-running
90+
perc50Threshold: {{$UPSIZE_PERC50_THRESHOLD}}
91+
perc90Threshold: {{$UPSIZE_PERC90_THRESHOLD}}
92+
threshold: {{$UPSIZE_THRESHOLD}}
6893
- Identifier: FastFillClaimAllocationLatency
6994
Method: ResourceClaimAllocationLatency
7095
Params:
@@ -98,7 +123,10 @@ steps:
98123
tuningSet: FastFill
99124
objectBundle:
100125
- basename: single-gpu
126+
# Add other resourceclaimtemplates for different drivers
127+
{{if eq $draManifests "example"}}
101128
objectTemplatePath: "resourceclaimtemplate.yaml"
129+
{{end}}
102130
- name: Fill cluster to {{$fillPercentage}}% utilization
103131
phases:
104132
- namespaceRange:
@@ -120,7 +148,7 @@ steps:
120148
Params:
121149
action: gather
122150
labelSelector: job-type = long-running
123-
timeout: 15m
151+
timeout: {{$RUNNING_JOBS_THRESHOLD}}
124152
- name: Gather measurements for long running pods
125153
measurements:
126154
- Identifier: FastFillSchedulingMetrics
@@ -131,6 +159,9 @@ steps:
131159
Method: PodStartupLatency
132160
Params:
133161
action: gather
162+
perc50Threshold: {{$UPSIZE_PERC50_THRESHOLD}}
163+
perc90Threshold: {{$UPSIZE_PERC90_THRESHOLD}}
164+
threshold: {{$UPSIZE_THRESHOLD}}
134165
- Identifier: FastFillClaimAllocationLatency
135166
Method: ResourceClaimAllocationLatency
136167
Params:
@@ -150,9 +181,9 @@ steps:
150181
Params:
151182
action: start
152183
labelSelector: job-type = short-lived
153-
perc50Threshold: 40s
154-
perc90Threshold: 60s
155-
perc99Threshold: 80s
184+
perc50Threshold: {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
185+
perc90Threshold: {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
186+
perc99Threshold: {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
156187
- Identifier: ChurnClaimAllocationLatency
157188
Method: ResourceClaimAllocationLatency
158189
Params:
@@ -195,7 +226,7 @@ steps:
195226
Params:
196227
action: gather
197228
labelSelector: job-type = short-lived
198-
timeout: 15m
229+
timeout: {{$FINISHED_JOBS_THRESHOLD}}
199230
- name: Measure scheduler metrics
200231
measurements:
201232
- Identifier: ChurnSchedulingMetrics
@@ -206,14 +237,14 @@ steps:
206237
Method: PodStartupLatency
207238
Params:
208239
action: gather
209-
perc50Threshold: 40s
210-
perc90Threshold: 60s
211-
perc99Threshold: 80s
240+
perc50Threshold: {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
241+
perc90Threshold: {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
242+
perc99Threshold: {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
212243
- Identifier: ChurnClaimAllocationLatency
213244
Method: ResourceClaimAllocationLatency
214245
Params:
215246
action: gather
216247
- Identifier: ChurnDRAMetrics
217248
Method: GenericPrometheusQuery
218249
Params:
219-
action: gather
250+
action: gather

clusterloader2/testing/dra/job.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ spec:
99
parallelism: {{.Replicas}}
1010
completions: {{.CompletionReplicas}}
1111
completionMode: {{.Mode}}
12-
ttlSecondsAfterFinished: 300
12+
# In tests involving a large number of sequentially created, short-lived jobs, the spin-up time may be significant.
13+
# A TTL of 1 hour should be sufficient to retain the jobs long enough for measurement checks.
14+
ttlSecondsAfterFinished: 3600 # 1 hour
1315
template:
1416
metadata:
1517
labels:

0 commit comments

Comments
 (0)