Commit cba8b3d

Parametrize DRA test config
1 parent d57e351 commit cba8b3d

4 files changed: +47 -22 lines changed

clusterloader2/pkg/dependency/dra/dra.go

Lines changed: 0 additions & 6 deletions
@@ -167,12 +167,6 @@ func (d *draDependency) isDRADriverReady(config *dependency.Config, daemonsetNam
 }
 
 func isResourceSlicesPublished(config *dependency.Config, namespace string) (bool, error) {
-	// Get a list of all nodes
-	// nodes, err := getReadyNodesCount(config)
-	// if err != nil {
-	// 	return false, fmt.Errorf("failed to list nodes: %v", err)
-	// }
-
 	driverPluginPods, err := getDriverPluginPods(config, namespace, draDaemonsetName)
 	if err != nil {
 		return false, fmt.Errorf("failed to list driverPluginPods: %v", err)

clusterloader2/testing/dra/config.yaml

Lines changed: 43 additions & 14 deletions
@@ -2,12 +2,28 @@
 {{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
 {{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
 {{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
+{{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
+{{$UPSIZE_THRESHOLD := DefaultParam .CL2_UPSIZE_THRESHOLD "5s"}}
+{{$UPSIZE_PERC50_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC50_THRESHOLD "5s"}}
+{{$UPSIZE_PERC90_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC90_THRESHOLD "5s"}}
+{{$CHURN_POD_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC50_THRESHOLD "5s"}}
+{{$CHURN_POD_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC90_THRESHOLD "5s"}}
+{{$CHURN_POD_STARTUP_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD "5s"}}
+{{$FINISHED_JOBS_THRESHOLD := DefaultParam .CL2_FINISHED_JOBS_THRESHOLD "10m"}}
+{{$RUNNING_JOBS_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_THRESHOLD "10m"}}
+{{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}}
 {{$token := .CL2_TOKEN }}
 
 {{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
 
+# dra
+{{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "dra-example-driver"}}
+{{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}}
+
 # Node resource configuration
 {{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
+{{$resourceSlicesPerNode := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
+{{$totalResourceSliceCount := MultiplyInt $resourceSlicesPerNode .Nodes}}
 {{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}
 
 # fast fill job configuration - for initial fill up
@@ -19,11 +35,14 @@
 
 # churn job configuration for steady state
 {{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
-{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
+{{$calculatedSJPN := DivideInt $smallJobPodsCount $namespaces}}
+{{$maxSJPN := DefaultParam .CL2_MAX_SMALL_JOBS_PER_NAMESPACE 999999}}
+{{$smallJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}}
 {{$smallJobSize := 1}}
-{{$smallJobCompletions := 10}}
+{{$smallJobCompletions := DefaultParam .CL2_SMALL_JOB_COMPLETIONS 10}}
 {{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
 {{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}}
+{{$deviceClassName := DefaultParam .CL2_DEVICE_CLASS_NAME "gpu.example.com"}}
 
 {{$extendedResourceName := ""}}
 {{if $ENABLE_EXTENDED_RESOURCES}}
@@ -46,10 +65,12 @@ tuningSets:
 
 
 dependencies:
-- name: Install dra-example-driver for test
+- name: Install dra-driver for test
  Method: DRATestDriver
  Params:
-    WorkerNodeCount: {{.Nodes}}
+    WorkerNodeCount: {{$totalResourceSliceCount}}
+    DaemonsetName: {{$draDaemonsetName}}
+    Manifests: {{$draManifests}}
  {{if $ENABLE_EXTENDED_RESOURCES}}
    ExtendedResourceName: {{$extendedResourceName}}
  {{end}}
@@ -70,12 +91,15 @@ steps:
       apiVersion: batch/v1
       kind: Job
       labelSelector: job-type = long-running
-      operationTimeout: 120s
+      operationTimeout: {{$RUNNING_JOBS_OPERATION_THRESHOLD}}
   - Identifier: FastFillPodStartupLatency
     Method: PodStartupLatency
     Params:
       action: start
       labelSelector: job-type = long-running
+      perc50Threshold: {{$UPSIZE_PERC50_THRESHOLD}}
+      perc90Threshold: {{$UPSIZE_PERC90_THRESHOLD}}
+      threshold: {{$UPSIZE_THRESHOLD}}
   - Identifier: FastFillClaimAllocationLatency
     Method: ResourceClaimAllocationLatency
     Params:
@@ -111,6 +135,8 @@ steps:
     objectBundle:
     - basename: single-gpu
       objectTemplatePath: "resourceclaimtemplate.yaml"
+      templateFillMap:
+        DeviceClassName: {{$deviceClassName}}
 {{end}}
 - name: Fill cluster to {{$fillPercentage}}% utilization
   phases:
@@ -134,7 +160,7 @@ steps:
     Params:
       action: gather
       labelSelector: job-type = long-running
-      timeout: 15m
+      timeout: {{$RUNNING_JOBS_THRESHOLD}}
 - name: Gather measurements for long running pods
   measurements:
   - Identifier: FastFillSchedulingMetrics
@@ -145,6 +171,9 @@ steps:
     Method: PodStartupLatency
     Params:
       action: gather
+      perc50Threshold: {{$UPSIZE_PERC50_THRESHOLD}}
+      perc90Threshold: {{$UPSIZE_PERC90_THRESHOLD}}
+      threshold: {{$UPSIZE_THRESHOLD}}
   - Identifier: FastFillClaimAllocationLatency
     Method: ResourceClaimAllocationLatency
     Params:
@@ -164,9 +193,9 @@ steps:
     Params:
       action: start
       labelSelector: job-type = short-lived
-      perc50Threshold: 40s
-      perc90Threshold: 60s
-      perc99Threshold: 80s
+      perc50Threshold: {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
+      perc90Threshold: {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
+      perc99Threshold: {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
   - Identifier: ChurnClaimAllocationLatency
     Method: ResourceClaimAllocationLatency
     Params:
@@ -210,7 +239,7 @@ steps:
     Params:
       action: gather
       labelSelector: job-type = short-lived
-      timeout: 15m
+      timeout: {{$FINISHED_JOBS_THRESHOLD}}
 - name: Measure scheduler metrics
   measurements:
   - Identifier: ChurnSchedulingMetrics
@@ -221,14 +250,14 @@ steps:
     Method: PodStartupLatency
     Params:
       action: gather
-      perc50Threshold: 40s
-      perc90Threshold: 60s
-      perc99Threshold: 80s
+      perc50Threshold: {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
+      perc90Threshold: {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
+      perc99Threshold: {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
   - Identifier: ChurnClaimAllocationLatency
     Method: ResourceClaimAllocationLatency
     Params:
       action: gather
   - Identifier: ChurnDRAMetrics
     Method: GenericPrometheusQuery
     Params:
-      action: gather
+      action: gather
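
With the new CL2_* parameters above, the same test can be pointed at a different DRA driver without editing config.yaml. Below is a minimal sketch of an overrides file, typically passed to clusterloader2 via --testoverrides; the driver name, device class, and values are hypothetical, and only the parameter keys come from this commit:

# overrides.yaml (illustrative values only)
CL2_DRA_MANIFESTS: my-dra-driver                      # hypothetical manifests bundle
CL2_DRA_DAEMONSET_NAME: my-dra-driver-kubeletplugin   # hypothetical kubelet plugin DaemonSet
CL2_DEVICE_CLASS_NAME: gpu.my-driver.example.com      # hypothetical DeviceClass name
CL2_RESOURCE_SLICES_PER_NODE: 2
CL2_SMALL_JOB_COMPLETIONS: 5
CL2_MAX_SMALL_JOBS_PER_NAMESPACE: 50
CL2_UPSIZE_THRESHOLD: 10s
CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD: 30s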

clusterloader2/testing/dra/job.yaml

Lines changed: 3 additions & 1 deletion
@@ -9,7 +9,9 @@ spec:
   parallelism: {{.Replicas}}
   completions: {{.CompletionReplicas}}
   completionMode: {{.Mode}}
-  ttlSecondsAfterFinished: 300
+  # In tests involving a large number of sequentially created, short-lived jobs, the spin-up time may be significant.
+  # A TTL of 1 hour should be sufficient to retain the jobs long enough for measurement checks.
+  ttlSecondsAfterFinished: 3600 # 1 hour
   template:
     metadata:
       labels:
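
As a rough check on the new TTL, here is a back-of-the-envelope sketch using this config's defaults, assuming each short-lived job runs its completions sequentially at parallelism 1; actual timing also depends on scheduling latency and churn throughput:

# jobRunningTime = 30s, smallJobCompletions = 10, smallJobSize (parallelism) = 1
#   minimum lifetime of one short-lived job: 10 * 30s = 300s
# The old TTL of 300s matched that minimum, so jobs that finished early in the
# churn phase could be garbage-collected before the gather steps complete;
# 3600s leaves roughly 55 minutes of slack for sequential job creation and
# measurement gathering.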

clusterloader2/testing/dra/resourceclaimtemplate.yaml

Lines changed: 1 addition & 1 deletion
@@ -8,4 +8,4 @@ spec:
       requests:
       - name: gpu
         exactly:
-          deviceClassName: gpu.example.com
+          deviceClassName: {{.DeviceClassName}}
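
The template value is filled from the templateFillMap added in config.yaml (DeviceClassName: {{$deviceClassName}}, defaulting to gpu.example.com). A sketch of how this hunk renders when CL2_DEVICE_CLASS_NAME is left at its default; only the lines shown above come from the source file:

      requests:
      - name: gpu
        exactly:
          deviceClassName: gpu.example.com   # value of CL2_DEVICE_CLASS_NAME (default)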
